diff --git a/.devcontainer/library-scripts/common-debian.sh b/.devcontainer/library-scripts/common-debian.sh index 03ec6aa..41b7d93 100644 --- a/.devcontainer/library-scripts/common-debian.sh +++ b/.devcontainer/library-scripts/common-debian.sh @@ -112,8 +112,9 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then strace \ manpages \ manpages-dev \ - init-system-helpers" - + init-system-helpers \ + capnproto" + # Needed for adding manpages-posix and manpages-posix-dev which are non-free packages in Debian if [ "${ADD_NON_FREE_PACKAGES}" = "true" ]; then # Bring in variables from /etc/os-release like VERSION_CODENAME @@ -124,7 +125,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then sed -i -E "s/deb-src http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME}-updates main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME}-updates main contrib non-free/" /etc/apt/sources.list sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list sed -i "s/deb-src http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list - sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list + sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list sed -i "s/deb-src http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list # Handle bullseye location for security https://www.debian.org/releases/bullseye/amd64/release-notes/ch-information.en.html sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main contrib non-free/" /etc/apt/sources.list @@ -140,7 +141,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then if [[ ! -z $(apt-cache --names-only search ^libssl1.1$) ]]; then package_list="${package_list} libssl1.1" fi - + # Install appropriate version of libssl1.0.x if available libssl_package=$(dpkg-query -f '${db:Status-Abbrev}\t${binary:Package}\n' -W 'libssl1\.0\.?' 2>&1 || echo '') if [ "$(echo "$LIlibssl_packageBSSL" | grep -o 'libssl1\.0\.[0-9]:' | uniq | sort | wc -l)" -eq 0 ]; then @@ -155,7 +156,7 @@ if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then echo "Packages to verify are installed: ${package_list}" apt-get -y install --no-install-recommends ${package_list} 2> >( grep -v 'debconf: delaying package configuration, since apt-utils is not installed' >&2 ) - + # Install git if not already installed (may be more recent than distro version) if ! type git > /dev/null 2>&1; then apt-get -y install --no-install-recommends git @@ -174,7 +175,7 @@ fi # Ensure at least the en_US.UTF-8 UTF-8 locale is available. # Common need for both applications and things like the agnoster ZSH theme. if [ "${LOCALE_ALREADY_SET}" != "true" ] && ! 
grep -o -E '^\s*en_US.UTF-8\s+UTF-8' /etc/locale.gen > /dev/null; then - echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen + echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen locale-gen LOCALE_ALREADY_SET="true" fi @@ -183,12 +184,12 @@ fi group_name="${USERNAME}" if id -u ${USERNAME} > /dev/null 2>&1; then # User exists, update if needed - if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then + if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then group_name="$(id -gn $USERNAME)" groupmod --gid $USER_GID ${group_name} usermod --gid $USER_GID $USERNAME fi - if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then + if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then usermod --uid $USER_UID $USERNAME fi else @@ -198,7 +199,7 @@ else else groupadd --gid $USER_GID $USERNAME fi - if [ "${USER_UID}" = "automatic" ]; then + if [ "${USER_UID}" = "automatic" ]; then useradd -s /bin/bash --gid $USERNAME -m $USERNAME else useradd -s /bin/bash --uid $USER_UID --gid $USERNAME -m $USERNAME @@ -213,7 +214,7 @@ if [ "${USERNAME}" != "root" ] && [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}" fi # ** Shell customization section ** -if [ "${USERNAME}" = "root" ]; then +if [ "${USERNAME}" = "root" ]; then user_rc_path="/root" else user_rc_path="/home/${USERNAME}" @@ -250,9 +251,9 @@ fi # Set the default git editor if not already set if [ -z "$(git config --get core.editor)" ] && [ -z "${GIT_EDITOR}" ]; then if [ "${TERM_PROGRAM}" = "vscode" ]; then - if [[ -n $(command -v code-insiders) && -z $(command -v code) ]]; then + if [[ -n $(command -v code-insiders) && -z $(command -v code) ]]; then export GIT_EDITOR="code-insiders --wait" - else + else export GIT_EDITOR="code --wait" fi fi @@ -329,7 +330,7 @@ codespaces_zsh="$(cat \ # Codespaces zsh prompt theme __zsh_prompt() { local prompt_username - if [ ! -z "${GITHUB_USER}" ]; then + if [ ! 
-z "${GITHUB_USER}" ]; then prompt_username="@${GITHUB_USER}" else prompt_username="%n" diff --git a/.gitignore b/.gitignore index 0c8ae60..d29eed6 100644 --- a/.gitignore +++ b/.gitignore @@ -34,9 +34,14 @@ doc/internal/services/images/shepherd.png doc/internal/services/transport_agent/images/transport_agent.png doc/plantuml.jar external/external_deps/ +external/capnproto +external/pycapnp +src/release +src/include/capnp +src/include/kj +src/lib/message_defs.capnp.h src/*egg* -src/bin/dragon_hsta -src/build/dragon_hsta +src/bin/dragon-hsta src/dist/ src/doxygen/ src/dragon/dlogging/pydragon_logging.c @@ -47,11 +52,16 @@ src/dragon/launcher/pydragon_pmsgqueue.c src/dragon/pydragon_*.c src/dragon/transport/hsta/__init__.c src/dragon/transport/hsta/__main__.c -src/dragon/transport/hsta/crash-* src/dragon/transport/hsta/dragon_hsta -src/dragon/transport/hsta/hsta_dbg.*.out -src/dragon/transport/hsta/ideas.txt -src/dragon/transport/hsta/leak-* src/include/dragon/return_codes_map.h src/bin src/dragon/transport/hsta/dragon-hsta +src/lib/message_defs.capnp.c++ +src/include/dragon/message_defs.capnp.h +test/channels_subtests/test.out +src/dragon/infrastructure/message_defs.capnp +src/include/dragon/message_tcs.hpp +test/native/flimsgfrom +test/native/flimsgto +src/lib/_message_tcs.hpp +src/.dragon-config.mk diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..fbb4229 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/capnproto"] + path = external/capnproto + url = https://github.com/capnproto/capnproto.git diff --git a/README.rst b/README.rst index 1ce5484..a0c280b 100644 --- a/README.rst +++ b/README.rst @@ -65,49 +65,12 @@ If you wish to run multi-node or don't want to run in a container, you must set up your environment to run Dragon programs. Choose the version of Dragon to download that goes with your installed version of Python. Python 3.9+ is required to run Dragon. You must have Python installed and it must be in your path -somewhere. A common choice is to use a Python virtual environment, which can be -initialized from a base Python with: +somewhere. -.. code-block:: console - - python3 -m venv --clear _env - . _env/bin/activate - -The untarred distribution file contains several subdirectories. All provided commands -are relative to the directory that contains the README.rst. - -* The `dragon-*.whl` file must be pip3 installed once for your environment. - -.. code-block:: console - - pip3 install --force-reinstall dragon-0.8-cp39-cp39-linux_x86_64.whl - -* Check and possibly update your `PATH` environment variable to include the location of - pip installed console scripts, such as ~/.local/bin if you're not using a virtual environment. - -.. code-block:: console - - export PATH=~/.local/bin:${PATH} - -* You must set up the environment by loading the dragon module as follows. - -.. code-block:: console - - module use [/path to dragon-0.8]/modulefiles - module load dragon - -If you intend to use Dragon on your own Linux VM or an image that you -personally installed, you may need to enable module commands by adding the -following command to your ~/.bashrc or other login script. - -.. code-block:: console - - source /usr/share/modules/init/bash - -If you use a different shell, look in the `init` directory for a script for -your shell. - -You have completed the prerequisites for running Dragon with multiprocessing programs. +The untarred distribution file contains several subdirectories. 
Run the +./dragon-install file in that root directory to create a python virtual +environment and install two wheel files. For further details, follow the +instructions that you find in that README.md file in the distribution directory. Running Dragon ============== diff --git a/doc/cbook/ai-in-the-loop.rst b/doc/cbook/ai-in-the-loop.rst index 9daa463..d30c9fd 100755 --- a/doc/cbook/ai-in-the-loop.rst +++ b/doc/cbook/ai-in-the-loop.rst @@ -49,7 +49,7 @@ The code of the other files can be found in the release package, inside `example from itertools import count from model import Net, make_features, infer, train - from dragon.native.process import Process, TemplateProcess, Popen + from dragon.native.process import Process, ProcessTemplate, Popen from dragon.native.process_group import ProcessGroup from dragon.infrastructure.connection import Connection from dragon.native.machine import System @@ -112,12 +112,12 @@ The code of the other files can be found in the release package, inside `example grp = ProcessGroup(restart=False, pmi_enabled=True) # Pipe the stdout output from the head process to a Dragon connection - grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) + grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) # All other ranks should have their output go to DEVNULL grp.add_process( nproc=num_ranks - 1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), + template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), ) # start the process group grp.init() @@ -153,12 +153,12 @@ The code of the other files can be found in the release package, inside `example grp = ProcessGroup(restart=False, pmi_enabled=True) # Pipe the stdout output from the head process to a Dragon connection - grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) + grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) # All other ranks should have their output go to DEVNULL grp.add_process( nproc=num_ranks - 1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), + template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), ) # start the process group grp.init() diff --git a/doc/cbook/basic_pandarallel_demo.rst b/doc/cbook/basic_pandarallel_demo.rst index 4feb65e..2dddb4c 100644 --- a/doc/cbook/basic_pandarallel_demo.rst +++ b/doc/cbook/basic_pandarallel_demo.rst @@ -1,8 +1,8 @@ Basic Pandarallel Demonstration for Single Node Environment ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call. -It can be run with `dragon` and base multiprocessing to compare performance on your machine. +This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call. +It can be run with `dragon` and base multiprocessing to compare performance on your machine. The program demonstrates how to use `parallel_apply`, the multiprocessing verison of pandas `apply`, on a pandas dataframe with random input. @@ -12,4 +12,4 @@ The code demonstrates the following key concepts working with Dragon: * How to use pandarallel and pandas with Dragon and base multiprocessing * How pandarallel handles various dtypes -.. literalinclude:: ../../examples/jupyter/doc_ref/basic_pandarallel_demo.py +.. 
literalinclude:: ../../examples/jupyter/doc_ref/basic_pandarallel_demo.py \ No newline at end of file diff --git a/doc/cbook/bioinfo_alignment_pandarallel_demo.rst b/doc/cbook/bioinfo_alignment_pandarallel_demo.rst index 467038c..eea157f 100644 --- a/doc/cbook/bioinfo_alignment_pandarallel_demo.rst +++ b/doc/cbook/bioinfo_alignment_pandarallel_demo.rst @@ -24,7 +24,7 @@ The following notebook was used for the single-node comparison: .. literalinclude:: ../../examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py For the single-node run, both base multiprocessing and Dragon are compared. The runs utilized a single node with 2 AMD EPYC 7742 64-Core Processors with 128 cores. -Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware. +Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware. The timing for the base multiprocessing runtime is: @@ -71,10 +71,10 @@ The timing for the single-node Dragon runtime is: - - 27.174203 -For multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores. -The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends -multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time. -Base multiprocessing does not support multi-node workloads. +For multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores. +The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends +multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time. +Base multiprocessing does not support multi-node workloads. The following notebook was used for the multi-node comparison: diff --git a/doc/cbook/c_channels_demo.rst b/doc/cbook/c_channels_demo.rst index 7ba33c2..8462b59 100644 --- a/doc/cbook/c_channels_demo.rst +++ b/doc/cbook/c_channels_demo.rst @@ -200,7 +200,6 @@ captured by Dragon, any error messages are displayed back to the user. dragonMessage_t msg; char* send_ser_encoded; char* final_ser_encoded; - size_t send_ser_len; /* This function is necessary for off-node communication and relies on the * Dragon run-time services to supply gateway channels in the @@ -232,7 +231,7 @@ captured by Dragon, any error messages are displayed back to the user. * Dragon provides both base64 encoding and decoding for * interoperability between languages. */ - recv_chser.data = dragon_base64_decode(argv[3], strlen(argv[3]), &recv_chser.len); + recv_chser.data = dragon_base64_decode(argv[3], &recv_chser.len); /* With a valid serialized descriptor you can attach to a channel. This * attach here occurs on an off-node channel (except in the one node @@ -317,7 +316,7 @@ captured by Dragon, any error messages are displayed back to the user. 
return -1; } - send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len, &send_ser_len); + send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len); err = dragon_memory_pool_detach(&pool_descr); if (err != DRAGON_SUCCESS) { @@ -340,7 +339,7 @@ captured by Dragon, any error messages are displayed back to the user. send_ser_encoded = argv[4]; final_ser_encoded = argv[5]; - send_chser.data = dragon_base64_decode(send_ser_encoded, strlen(send_ser_encoded), &send_chser.len); + send_chser.data = dragon_base64_decode(send_ser_encoded, &send_chser.len); err = dragon_channel_attach(&send_chser, &send_ch); if (err != DRAGON_SUCCESS) { @@ -355,7 +354,7 @@ captured by Dragon, any error messages are displayed back to the user. return -1; } - final_chser.data = dragon_base64_decode(final_ser_encoded, strlen(final_ser_encoded), &final_chser.len); + final_chser.data = dragon_base64_decode(final_ser_encoded, &final_chser.len); err = dragon_channel_attach(&final_chser, &final_ch); if (err != DRAGON_SUCCESS) { diff --git a/doc/cbook/cbook.rst b/doc/cbook/cbook.rst index 7297574..ccd4f1e 100644 --- a/doc/cbook/cbook.rst +++ b/doc/cbook/cbook.rst @@ -41,6 +41,7 @@ Dragon Native c_channels_demo.rst dragon_native_queue.rst dragon_mpi_workflow.rst + dragon_native_policy_demo.rst Dragon Data (Preview) ===================== diff --git a/doc/cbook/distr-inf-telemetry.rst b/doc/cbook/distr-inf-telemetry.rst index 6e32321..53dc865 100644 --- a/doc/cbook/distr-inf-telemetry.rst +++ b/doc/cbook/distr-inf-telemetry.rst @@ -211,7 +211,7 @@ in :numref:`single-prompt-response`. .. figure:: images/llm-grafana-telem-data.jpg :scale: 60% :name: node-telemetry - + **Node telemetry data that is visualized using Grafana GUI and highlights the load balanced nature of this example** @@ -354,4 +354,4 @@ Description of the system used ============================== For this example, an HPE Cray EX was used. Each node has AMD EPYC 7763 64-core -CPUs and 4x Nvidia A100 GPUs. +CPUs and 4x Nvidia A100 GPUs. \ No newline at end of file diff --git a/doc/cbook/dragon_dict.rst b/doc/cbook/dragon_dict.rst index a2b67b0..6c9fc44 100644 --- a/doc/cbook/dragon_dict.rst +++ b/doc/cbook/dragon_dict.rst @@ -99,6 +99,6 @@ aggregated rate of opearations as the dictionary managers are spawned across the .. 
figure:: images/dragon_dict_results.png :align: center :scale: 25% - :name: multinode-results + :name: multinode-results **Results on a multi-node setup** diff --git a/doc/cbook/dragon_mpi_workflow.rst b/doc/cbook/dragon_mpi_workflow.rst index f73a31f..1333340 100644 --- a/doc/cbook/dragon_mpi_workflow.rst +++ b/doc/cbook/dragon_mpi_workflow.rst @@ -47,7 +47,7 @@ processes: from dragon.globalservices import node from dragon.globalservices.process import multi_join from dragon.infrastructure.connection import Connection - from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, TemplateProcess + from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, ProcessTemplate from dragon.native.process_group import ProcessGroup logging.basicConfig(level=logging.INFO) @@ -81,13 +81,13 @@ processes: # Pipe the stdout output from the head process to a Dragon connection grp.add_process( nproc=1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE) + template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE) ) # All other ranks should have their output go to DEVNULL grp.add_process( nproc=num_ranks-1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL) + template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL) ) grp.init() diff --git a/doc/cbook/dragon_native_policy_demo.rst b/doc/cbook/dragon_native_policy_demo.rst new file mode 100644 index 0000000..bc2397c --- /dev/null +++ b/doc/cbook/dragon_native_policy_demo.rst @@ -0,0 +1,75 @@ +Using Dragon policies to control placement and resources for processes +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +This example shows how policies can be passed and applied to processes that are started from a process group. +Policies can be applied to the whole group as well as to individual processes. +In this example, we apply a group policy that restricts the cpu affinity of all processes that are part of the group. +A policy is then applied in batches to processes that are part of the group that restrict the placement of the processes to specific nodes. +To demonstrate this restricted placement, we launch an MPI program, `mpi_hello`, that returns the hostname that it is running on along with its local process ID and its rank within the group. + +Note, if the group policy and process policy conflict, an error is not raised. +Instead, we resolve conflicts based on the following hierarchy: process policies > group policies > global policy. + +This example consists of the following files: + +* `policy_demo.py` - This is the main file. It defines the policies and process group, launches the group, and then parses the output from the ranks before printing the output. + +* `mpi_hello.c` - This file contains a simple MPI program that prints the hostname, pid, and rank within the MPI group. + +Below, we present the main python code (`policy_demo.py`) which acts as the coordinator of the workflow. +The code of the other files can be found in the release package, inside `examples/dragon_native/mpi` directory. + + +.. literalinclude:: ../../examples/dragon_native/mpi/policy_demo.py + :language: python + +How to run +========== + +Example Output when run on 4 nodes with 8 AMD GPUs per node +------------------------------------------------------------------------------------- + +.. 
code-block:: console + :linenos: + + > make + gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib -c mpi_hello.c -o mpi_hello.c.o + gcc -lm -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib -lmpich mpi_hello.c.o -o mpi_hello + > salloc --nodes=4 --exclusive + > dragon policy_demo.py + Using 2 of 4 + pinoak0015 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] + pinoak0016 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] + pinoak0014 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] + pinoak0013 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] + 4294967298 returned output: Hello world from pid 57645, processor pinoak0015, rank 0 out of 16 processors + + 4294967299 returned output: Hello world from pid 57646, processor pinoak0015, rank 1 out of 16 processors + + 4294967300 returned output: Hello world from pid 57647, processor pinoak0015, rank 2 out of 16 processors + + 4294967301 returned output: Hello world from pid 57648, processor pinoak0015, rank 3 out of 16 processors + + 4294967302 returned output: Hello world from pid 57649, processor pinoak0015, rank 4 out of 16 processors + + 4294967303 returned output: Hello world from pid 57650, processor pinoak0015, rank 5 out of 16 processors + + 4294967304 returned output: Hello world from pid 57651, processor pinoak0015, rank 6 out of 16 processors + + 4294967305 returned output: Hello world from pid 57652, processor pinoak0015, rank 7 out of 16 processors + + 4294967306 returned output: Hello world from pid 56247, processor pinoak0016, rank 8 out of 16 processors + + 4294967307 returned output: Hello world from pid 56248, processor pinoak0016, rank 9 out of 16 processors + + 4294967308 returned output: Hello world from pid 56249, processor pinoak0016, rank 10 out of 16 processors + + 4294967309 returned output: Hello world from pid 56250, processor pinoak0016, rank 11 out of 16 processors + + 4294967310 returned output: Hello world from pid 56251, processor pinoak0016, rank 12 out of 16 processors + + 4294967311 returned output: Hello world from pid 56252, processor pinoak0016, rank 13 out of 16 processors + + 4294967312 returned output: Hello world from pid 56253, processor pinoak0016, rank 14 out of 16 processors + + 4294967313 returned output: Hello world from pid 56254, processor pinoak0016, rank 15 out of 16 processors diff --git a/doc/cbook/images/sharedstate_pascal_triangle.jpg b/doc/cbook/images/sharedstate_pascal_triangle.jpg new file mode 100644 index 0000000..df5a16f Binary files /dev/null and b/doc/cbook/images/sharedstate_pascal_triangle.jpg differ diff --git a/doc/cbook/shared_state_pascal_triangle.rst b/doc/cbook/shared_state_pascal_triangle.rst new file mode 100644 index 0000000..715cc81 --- /dev/null +++ b/doc/cbook/shared_state_pascal_triangle.rst @@ -0,0 +1,34 @@ +Shared State Example Utilizing Pascal Triangle +================================================= + +In this example, we demonstrate that the Dragon multiprocessing interface can be used to create a simple shared state example. We utilize the Pascal triangle which is a method for calcuating coefficients for binomial expansions. +The manager multiprocessing process and the client multiprocessing process communicate via a shared state spawned by the context multiprocessing process. +The main multiprocessing process will start the manager and client multiprocessing processes. 
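The release code for this pattern lives in `examples/multiprocessing/shared_state_pascal_triangle.py`; the sketch below is only a minimal illustration of the manager/client/lock arrangement described here, written against the standard `multiprocessing` API that Dragon implements. The names (`client`, `manager`, `shared`, `total`) are illustrative and are not taken from the release example.

.. code-block:: python

    # Minimal sketch of the shared-state pattern (illustrative names, not the
    # release example).  A "client" process extends a lock-guarded shared list
    # with Pascal triangle rows; a "manager" process waits until the triangle
    # is complete and then stores the sum.
    import multiprocessing as mp
    import time


    def client(rows, shared, lock):
        row = [1]
        for _ in range(rows + 1):          # rows 0 .. rows
            with lock:                     # only the lock holder may touch the array
                shared.extend(row)
            row = [1] + [row[i] + row[i + 1] for i in range(len(row) - 1)] + [1]


    def manager(rows, shared, lock, total):
        expected = (rows + 1) * (rows + 2) // 2   # number of entries in rows 0 .. rows
        while True:
            with lock:
                if len(shared) == expected:
                    total.value = sum(shared)     # equals 2**(rows + 1) - 1
                    return
            time.sleep(0.01)


    if __name__ == "__main__":
        # mp.set_start_method("dragon")  # when running under the Dragon runtime
        rows = 5
        lock = mp.Lock()
        with mp.Manager() as m:
            shared = m.list()
            total = m.Value("i", 0)
            procs = [mp.Process(target=client, args=(rows, shared, lock)),
                     mp.Process(target=manager, args=(rows, shared, lock, total))]
            for p in procs:
                p.start()
            for p in procs:
                p.join()
            print(list(shared), total.value)
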
The manager process finds the sum of the Pascal triangle array calcualated by the client process. +The third multiprocessing process spawned by the context class finds when the Pascal triangle has been completed. +The shared state that contains the Pascal triangle array and the Pascal triangle sum is guarded by a lock; only the process that accesses the lock may alter the array and value. + +:numref:`sharedstate_pascal_triangle` presents the code flow for the manager-client-shared state communication utilizing a common queue. + +.. figure:: images/sharedstate_pascal_triangle.jpg + :scale: 10% + :name: sharedstate_pascal_triangle + + **Example Pascal-Triangle SharedState Program* + +How to run +========== + +The following code shows how a shared state passed between the manager, client, and context Dragon multiprocessing processes can be used to compute all the elements of the Pascal triangle +and the sum of all the elements in the Pascal triangle given the number of rows of the Pascal triangle of interest: + +.. literalinclude:: ../../examples/multiprocessing/shared_state_pascal_triangle.py + +The following output is expected when the user provides the row input of 5: + +.. code-block:: console + :linenos: + + > dragon shared_state_pascal_triangle.py --rows 5 + Pascal Triangle Array Calculated for 5 rows from the Pascal row of 0 to the Pascal row of 5 , and the associated sum of the Pascal triangle array. + Pascal Triangle Array [1, 1, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 4, 6, 4, 1] + Pascal Triangle Sum: 32 \ No newline at end of file diff --git a/doc/components/broadcast.rst b/doc/components/broadcast.rst index c37af98..02bacbb 100644 --- a/doc/components/broadcast.rst +++ b/doc/components/broadcast.rst @@ -13,8 +13,8 @@ call to a trigger function. The payload is optional. The BCast object provides a synchronization/communication structure. .. figure:: images/bcast.png - :name: bcast-any-to-many - + :name: bcast-any-to-many + **An Any to Many Broadcast Synchronization Object** A BCast object is meant to be shared by multiple threads/processes. The object is first created by a process. @@ -49,7 +49,7 @@ Triggering processes may trigger one or all processes that are waiting on a BCas .. figure:: images/bcastflow.srms1.png :scale: 75% - :name: ops-on-bcast + :name: ops-on-bcast **Operations on a BCast Object** diff --git a/doc/components/managed_memory/bitset.rst b/doc/components/managed_memory/bitset.rst index ab4926d..4ffe9d4 100644 --- a/doc/components/managed_memory/bitset.rst +++ b/doc/components/managed_memory/bitset.rst @@ -18,7 +18,7 @@ the functions. .. code-block:: C :linenos: :caption: **A BitSet Example** - :name: bitset-example + :name: bitset-example size_t bitsetsize; dragonBitSetErr_t brc; @@ -63,7 +63,7 @@ be displayed. .. code-block:: text :caption: **BitSet Example Output** - :name: bitset-example-output + :name: bitset-example-output That was a one A Bit Dump @@ -91,7 +91,7 @@ API accesses the BitSet. The handle structure is given in :numref:`bitset-handle .. code-block:: C :caption: **BitSet Handle Definition** - :name: bitset-handle-def + :name: bitset-handle-def typedef struct dragonBitSet_st { size_t size; diff --git a/doc/components/managed_memory/heapmanager.rst b/doc/components/managed_memory/heapmanager.rst index 1d150e3..2179d89 100644 --- a/doc/components/managed_memory/heapmanager.rst +++ b/doc/components/managed_memory/heapmanager.rst @@ -21,7 +21,7 @@ An Example of Malloc and Free ============================= .. 
figure:: images/heapallocations.png - :name: heap-allocations + :name: heap-allocations **A Sample Heap with Allocations** @@ -55,7 +55,7 @@ size. .. code-block:: C :linenos: :caption: **Heap Initialization** - :name: heap-init + :name: heap-init // make a heap of size 1K with 32 byte segments as minimum block size. How much space // is required? This call determines how much space is required for a heap with @@ -99,7 +99,7 @@ The algorithm doesn't consider anything further, but because segment 0 is in a b block is part of a block of 64 bytes, they could not be joined either (at this point anyway). .. figure:: images/heapfree1.png - :name: heap-free-green + :name: heap-free-green **After Freeing the Green Block** @@ -112,7 +112,7 @@ are three free blocks that are available in the heap. The segment 1 is a 32 byte make up a 64 byte free block. Finally, the segments 4-6 make up a 128 byte free block. .. figure:: images/heapfree2.png - :name: heap-free-purple + :name: heap-free-purple **After Freeing the Purple Block** @@ -123,7 +123,7 @@ The 512 byte block starting at segment 16 is freed next and results in once agai segment 0. Again, segment 0 is not free and no further joining of blocks is possible. .. figure:: images/heapfree3.png - :name: heap-free-yellow + :name: heap-free-yellow **After Freeing the Yellow Block** @@ -142,7 +142,7 @@ At this point there are two free blocks: a 256 byte block starting at segment 0 at segment 16. .. figure:: images/heapfree4.png - :name: heap-free-orange + :name: heap-free-orange **After Freeing the Orange Block** @@ -155,7 +155,7 @@ but since its buddy is also free and the same size, the two 512 byte blocks are block. .. figure:: images/heapfree5.png - :name: heap-free-maroon + :name: heap-free-maroon **After Freeing the Maroon Block** @@ -168,7 +168,7 @@ two bit sets, the block set and the free set. There is also a lock associated wi multi-processing compatible. .. figure:: images/metadata.png - :name: metadata + :name: metadata **Meta-Data and Handle Structure** diff --git a/doc/components/managed_memory/hexdump.rst b/doc/components/managed_memory/hexdump.rst index 1dc9a2e..0a478b1 100644 --- a/doc/components/managed_memory/hexdump.rst +++ b/doc/components/managed_memory/hexdump.rst @@ -21,7 +21,7 @@ an indentation string. .. code-block:: C :caption: **Hex Dump Example Code** - :name: hex-dump-example-code + :name: hex-dump-example-code hex_dump_to_fd(fd, "BITS",(void*)set->data,num_bytes,indent); @@ -32,7 +32,7 @@ string to print before each line of the dump. .. code-block:: text :caption: **Hex Dump Sample Output** - :name: hex-dump-example-output + :name: hex-dump-example-output * BITS: * 00007FCF60C97070 80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ diff --git a/doc/components/managed_memory/managed_memory.rst b/doc/components/managed_memory/managed_memory.rst index b94905a..0c9bf75 100644 --- a/doc/components/managed_memory/managed_memory.rst +++ b/doc/components/managed_memory/managed_memory.rst @@ -31,7 +31,7 @@ Architecture ============ .. figure:: images/managed_memory.svg - :name: managed-mem-arch + :name: managed-mem-arch **Architecture of the Managed Memory component** diff --git a/doc/components/scalable_locks/scalable_locks.rst.needs_work b/doc/components/scalable_locks/scalable_locks.rst.needs_work index cc80e88..284c885 100644 --- a/doc/components/scalable_locks/scalable_locks.rst.needs_work +++ b/doc/components/scalable_locks/scalable_locks.rst.needs_work @@ -21,7 +21,7 @@ Architecture ============ .. 
figure:: images/scalable_locks.svg - :name: scalable-locks + :name: scalable-locks **Architecture of the Scalable Locks component** @@ -252,14 +252,14 @@ Functions Attach to the FIFO-style lock previously mapped into the memory pointed to by *prt* and return a :c:type:`dragonFIFOLock_t` handle to the lock. - + Returns ``DRAGON_SUCCESS`` or an error code. .. c:function:: dragonError_t dragon_greedy_lock_attach(dragonGreedyLock_t * dlock, void * ptr) Attach to the greedy-style lock previously mapped into the memory pointed to by *prt* and return a :c:type:`dragonGreedyLock_t` handle to the lock. - + Returns ``DRAGON_SUCCESS`` or an error code. .. c:function:: dragonError_t dragon_fifo_lock_detach(dragonFIFOLock_t * dlock) @@ -435,12 +435,12 @@ Operational Functions Waits for a write lock on the *RWLock* to become available. This will occur when all readers have released their locks by calling *dragon_rwlock_read_unlock*. Once the write lock is acquired, all readers wait until the write lock is released by calling *dragon_rwlock_write_unlock*. - + Returns DRAGON_SUCCESS, TBD. - + .. c:function:: dragonError_t dragon_rwlock_write_unlock(dragonRWLock_t * dlock) Releases the acquired write lock on the *RWLock* referred to by *dlock*. Any waiting readers or writers will be able to proceed. No guarantee is made for which will proceed first. - + Returns DRAGON_SUCCESS, TBD. \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py index 286f304..d94993c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -24,7 +24,7 @@ # -- Project information ----------------------------------------------------- project = "Dragon" -DragonVersion = "0.8" +DragonVersion = "0.9" copyright = "2024, Hewlett Packard Enterprise" author = "Michael Burke, Eric Cozzi, Zach Crisler, Julius Donnert, Veena Ghorakavi, Nick Hill, Maria Kalantzi, Ben Keen, Kent D. Lee, Pete Mendygral, Davin Potts and Nick Radcliffe" @@ -121,5 +121,19 @@ def can_document_member(cls, member: Any, membername: str, isattr: bool, parent: def add_directive_header(self, sig): pass +autodoc_default_flags = ['members', 'private-members', 'special-members', + #'undoc-members', + 'show-inheritance'] + +def autodoc_skip_member(app, what, name, obj, skip, options): + # Ref: https://stackoverflow.com/a/21449475/ + exclusions = ('__weakref__', '__new__', # special-members + '__doc__', '__module__', '__dict__', # undoc-members + ) + exclude = name in exclusions + # return True if (skip or exclude) else None # Can interfere with subsequent skip functions. + return True if exclude else None + def setup(app): + app.connect('autodoc-skip-member', autodoc_skip_member) app.add_autodocumenter(AutoDocstringOnly) diff --git a/doc/infrastructure/architecture.rst b/doc/infrastructure/architecture.rst index 03211a0..c900fff 100644 --- a/doc/infrastructure/architecture.rst +++ b/doc/infrastructure/architecture.rst @@ -14,7 +14,7 @@ not work in a distributed system. While very portable across time and operating descriptor based approach doesn't offer the best performance and scalability. .. 
figure:: images/infrastructure.svg - :name: infra-schematic + :name: infra-schematic **Dragon Runtime Architecture in a multi-node deployment** diff --git a/doc/infrastructure/images/.gitignore b/doc/infrastructure/images/.gitignore index f4af5af..244cfa3 100644 --- a/doc/infrastructure/images/.gitignore +++ b/doc/infrastructure/images/.gitignore @@ -1 +1 @@ -*.srms*.png +*.srms*.png \ No newline at end of file diff --git a/doc/infrastructure/infrastructure.rst b/doc/infrastructure/infrastructure.rst index bd8d77e..44d2129 100644 --- a/doc/infrastructure/infrastructure.rst +++ b/doc/infrastructure/infrastructure.rst @@ -15,4 +15,4 @@ Infrastructure bootstrapping.rst logging.rst policy.rst - overlay_network.rst + overlay_network.rst \ No newline at end of file diff --git a/doc/infrastructure/messages_api.rst b/doc/infrastructure/messages_api.rst index 215febe..6eccfe6 100644 --- a/doc/infrastructure/messages_api.rst +++ b/doc/infrastructure/messages_api.rst @@ -150,7 +150,7 @@ GS Process Messages 1. **GSProcessCreate** *type enum* - GS_PROCESS_CREATE (= 1) + GS_PROCESS_CREATE *purpose* Request to global services to create a new managed process. @@ -207,7 +207,7 @@ GS Process Messages 2. **GSProcessCreateResponse** *type enum* - GS_PROCESS_CREATE_RESPONSE (= 2) + GS_PROCESS_CREATE_RESPONSE *purpose* Response to process creation request. @@ -252,7 +252,7 @@ GS Process Messages 3. **GSProcessList** *type enum* - GS_PROCESS_LIST (= 3) + GS_PROCESS_LIST *purpose* Return a list of the p_uid for all the processes @@ -276,7 +276,7 @@ GS Process Messages 4. **GSProcessListResponse** *type enum* - GS_PROCESS_LIST_RESPONSE (= 4) + GS_PROCESS_LIST_RESPONSE *purpose* Responds with a list of the p_uid for all the @@ -299,7 +299,7 @@ GS Process Messages 5. **GSProcessQuery** *type enum* - GS_PROCESS_QUERY (= 5) + GS_PROCESS_QUERY *purpose* Request the ProcessDescriptor for a managed process @@ -330,7 +330,7 @@ GS Process Messages 6. **GSProcessQueryResponse** *type enum* - GS_PROCESS_QUERY_RESPONSE (= 6) + GS_PROCESS_QUERY_RESPONSE *purpose* Response to request for ProcessDescriptor for a managed process @@ -366,7 +366,7 @@ GS Process Messages 7. **GSProcessKill** *type enum* - GS_PROCESS_KILL (= 7) + GS_PROCESS_KILL *purpose* Request a managed process get killed @@ -402,7 +402,7 @@ GS Process Messages 8. **GSProcessKillResponse** *type enum* - GS_PROCESS_KILL_RESPONSE (= 8) + GS_PROCESS_KILL_RESPONSE *purpose* Response to GSProcessKill message @@ -449,7 +449,7 @@ GS Process Messages 9. **GSProcessJoin** *type enum* - GS_PROCESS_JOIN (= 9) + GS_PROCESS_JOIN *purpose* Request notification when a given process exits @@ -486,7 +486,7 @@ GS Process Messages 10. **GSProcessJoinResponse** *type enum* - GS_PROCESS_JOIN_RESPONSE (= 10) + GS_PROCESS_JOIN_RESPONSE *purpose* Response to request for notification when a process exits. @@ -528,7 +528,7 @@ GS Process Messages 95. **GSProcessJoinList** *type enum* - GS_PROCESS_JOIN_LIST (= 95) + GS_PROCESS_JOIN_LIST *purpose* Request notification when any/all process from a given list exits. @@ -569,7 +569,7 @@ GS Process Messages 96. **GSProcessJoinListResponse** *type enum* - GS_PROCESS_JOIN_LIST_RESPONSE (= 96) + GS_PROCESS_JOIN_LIST_RESPONSE *purpose* Response to request for notification when any/all process from a given list exits. @@ -640,7 +640,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 71. **GSPoolCreate** *type enum* - GS_POOL_CREATE (= 71) + GS_POOL_CREATE *purpose* Requests that a new user memory pool be created. 
@@ -682,7 +682,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 72. **GSPoolCreateResponse** *type enum* - GS_POOL_CREATE_RESPONSE (= 72) + GS_POOL_CREATE_RESPONSE *purpose* Response to request for a pool creation @@ -739,7 +739,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 73. **GSPoolDestroy** *type enum* - GS_POOL_DESTROY (= 73) + GS_POOL_DESTROY *purpose* Request destruction of a managed memory pool. @@ -772,7 +772,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 74. **GSPoolDestroyResponse** *type enum* - GS_POOL_DESTROY_RESPONSE (= 74) + GS_POOL_DESTROY_RESPONSE *purpose* Response to GSPoolDestroy message @@ -826,7 +826,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 75. **GSPoolList** *type enum* - GS_POOL_LIST (= 75) + GS_POOL_LIST *purpose* Return a list of tuples of ``m_uid`` for all pools currently alive. @@ -850,7 +850,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 76. **GSPoolListResponse** *type enum* - GS_POOL_LIST_RESPONSE (= 76) + GS_POOL_LIST_RESPONSE *purpose* Responds with a list of ``m_uid`` for all the @@ -874,7 +874,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 77. **GSPoolQuery** *type enum* - GS_POOL_QUERY (= 77) + GS_POOL_QUERY *purpose* Request the PoolDescriptor for a managed memory pool. @@ -905,7 +905,7 @@ ID number, the 'memory UID' normally abbreviated as ``m_uid``. 78. **GSPoolQueryResponse** *type enum* - GS_POOL_QUERY_RESPONSE (= 78) + GS_POOL_QUERY_RESPONSE *purpose* Response to request for PoolDescriptor for a managed memory pool. This object @@ -954,7 +954,7 @@ which itself has its own ``m_uid``. 11. **GSChannelCreate** *type enum* - GS_CHANNEL_CREATE (= 11) + GS_CHANNEL_CREATE *purpose* Requests that a new channel be created. @@ -993,7 +993,7 @@ which itself has its own ``m_uid``. 12. **GSChannelCreateResponse** *type enum* - GS_CHANNEL_CREATE_RESPONSE (= 12) + GS_CHANNEL_CREATE_RESPONSE *purpose* Response to channel creation request. @@ -1037,7 +1037,7 @@ which itself has its own ``m_uid``. 13. **GSChannelList** *type enum* - GS_CHANNEL_LIST (= 13) + GS_CHANNEL_LIST *purpose* Request list of currently active channels. @@ -1061,7 +1061,7 @@ which itself has its own ``m_uid``. 14. **GSChannelListResponse** *type enum* - GS_CHANNEL_LIST_RESPONSE (= 14) + GS_CHANNEL_LIST_RESPONSE *purpose* Response to request to list of currently active channels @@ -1087,7 +1087,7 @@ which itself has its own ``m_uid``. 15. **GSChannelQuery** *type enum* - GS_CHANNEL_QUERY (= 15) + GS_CHANNEL_QUERY *purpose* Request the descriptor for an already created channel by @@ -1124,7 +1124,7 @@ which itself has its own ``m_uid``. 16. **GSChannelQueryResponse** *type enum* - GS_CHANNEL_QUERY_RESPONSE (= 16) + GS_CHANNEL_QUERY_RESPONSE *purpose* Response to request for a channel descriptor @@ -1155,7 +1155,7 @@ which itself has its own ``m_uid``. 17. **GSChannelDestroy** *type enum* - GS_CHANNEL_DESTROY (= 17) + GS_CHANNEL_DESTROY *purpose* Request that a channel be destroyed or a refcount on the channel be released. @@ -1197,7 +1197,7 @@ which itself has its own ``m_uid``. 18. **GSChannelDestroyResponse** *type enum* - GS_CHANNEL_DESTROY_RESPONSE (= 18) + GS_CHANNEL_DESTROY_RESPONSE *purpose* Response to request to destroy a channel. @@ -1248,7 +1248,7 @@ which itself has its own ``m_uid``. and its response and ride all this on top of GSChannelQuery by adding a timeout. 
*type enum* - GS_CHANNEL_JOIN (= 19) + GS_CHANNEL_JOIN *purpose* Sometimes two processes want to communicate through a channel but we @@ -1278,7 +1278,7 @@ which itself has its own ``m_uid``. *type enum* - GS_CHANNEL_JOIN_RESPONSE (= 20) + GS_CHANNEL_JOIN_RESPONSE *purpose* Response to request to join a channel @@ -1318,7 +1318,7 @@ which itself has its own ``m_uid``. handles currently acquired. *type enum* - GS_CHANNEL_DETACH (= 21) + GS_CHANNEL_DETACH *purpose* Request local detachment from channel. @@ -1345,7 +1345,7 @@ which itself has its own ``m_uid``. PLACEHOLDER *type enum* - GS_CHANNEL_DETACH_RESPONSE (= 22) + GS_CHANNEL_DETACH_RESPONSE *purpose* Response to request to detach from a channel. @@ -1382,7 +1382,7 @@ which itself has its own ``m_uid``. PLACEHOLDER *type enum* - GS_CHANNEL_GET_SENDH (= 23) + GS_CHANNEL_GET_SENDH *purpose* Request send handle for channel @@ -1410,7 +1410,7 @@ which itself has its own ``m_uid``. PLACEHOLDER *type enum* - GS_CHANNEL_GET_SENDH_RESPONSE (= 24) + GS_CHANNEL_GET_SENDH_RESPONSE *purpose* Response to request to get a channel send handle @@ -1458,7 +1458,7 @@ which itself has its own ``m_uid``. PLACEHOLDER *type enum* - GS_CHANNEL_GET_RECVH (= 25) + GS_CHANNEL_GET_RECVH *purpose* Request recv handle for channel @@ -1486,7 +1486,7 @@ which itself has its own ``m_uid``. PLACEHOLDER *type enum* - GS_CHANNEL_GET_RECVH_RESPONSE (= 26) + GS_CHANNEL_GET_RECVH_RESPONSE *purpose* Response to request to get a channel recv handle @@ -1536,7 +1536,7 @@ querying the number of cpus in the system. 102. **GSNodeList** *type enum* - GS_NODE_LIST (= 102) + GS_NODE_LIST *purpose* Return a list of tuples of ``h_uid`` for all nodes currently registered. @@ -1560,7 +1560,7 @@ querying the number of cpus in the system. 103. **GSNodeListResponse** *type enum* - GS_NODE_LIST_RESPONSE (= 103) + GS_NODE_LIST_RESPONSE *purpose* Responds with a list of ``h_uid`` for all the @@ -1583,7 +1583,7 @@ querying the number of cpus in the system. 97. **GSNodeQuery** *type enum* - GS_NODE_QUERY (= 97) + GS_NODE_QUERY *purpose* Ask Global Services for a node descriptor of the hardware Dragon @@ -1611,7 +1611,7 @@ querying the number of cpus in the system. 98. **GSNodeQueryResponse** *type enum* - GS_NODE_QUERY_RESPONSE (= 98) + GS_NODE_QUERY_RESPONSE *purpose* Return the machine descriptor after a GSNodeQuery. @@ -1640,7 +1640,7 @@ querying the number of cpus in the system. 104. **GSNodeQueryTotalCPUCount** *type enum* - GS_NODE_QUERY_TOTAL_CPU_COUNT (= 104) + GS_NODE_QUERY_TOTAL_CPU_COUNT *purpose* Asks GS to return the total number of CPUS beloging to all of the registered nodes. @@ -1658,7 +1658,7 @@ querying the number of cpus in the system. 105. **GSNodeQueryTotalCPUCountResponse** *type enum* - GS_NODE_QUERY_TOTAL_CPU_COUNT_RESPONSE (= 105) + GS_NODE_QUERY_TOTAL_CPU_COUNT_RESPONSE *purpose* Return the total number of CPUS beloging to all of the registered nodes. @@ -1689,7 +1689,7 @@ querying the number of cpus in the system. 121. **GSGroupDestroy** *type enum* - GS_GROUP_DESTROY (= 121) + GS_GROUP_DESTROY *purpose* Ask Global Services to destroy a group of resources. This means destroy all the member-resources, as well as the container/group. @@ -1714,7 +1714,7 @@ querying the number of cpus in the system. 122. **GSGroupDestroyResponse** *type enum* - GS_GROUP_DESTROY_RESPONSE (= 122) + GS_GROUP_DESTROY_RESPONSE *purpose* Response to GSGroupDestroy message @@ -1758,7 +1758,7 @@ querying the number of cpus in the system. 123. 
**GSGroupAddTo** *type enum* - GS_GROUP_DESTROY (= 123) + GS_GROUP_DESTROY *purpose* Ask Global Services to add specific resources to an existing group of resources. The resources are already created. @@ -1787,7 +1787,7 @@ querying the number of cpus in the system. 124. **GSGroupAddToResponse** *type enum* - GS_GROUP_ADD_TO_RESPONSE (= 124) + GS_GROUP_ADD_TO_RESPONSE *purpose* Response to GSGroupAddTo message @@ -1838,7 +1838,7 @@ querying the number of cpus in the system. 125. **GSGroupRemoveFrom** *type enum* - GS_GROUP_REMOVE_FROM (= 125) + GS_GROUP_REMOVE_FROM *purpose* Ask Global Services to remove specific resources from an existing group of resources. @@ -1867,7 +1867,7 @@ querying the number of cpus in the system. 126. **GSGroupRemoveFromResponse** *type enum* - GS_GROUP_REMOVE_FROM_RESPONSE (= 126) + GS_GROUP_REMOVE_FROM_RESPONSE *purpose* Response to GSGroupRemoveFrom message @@ -1918,7 +1918,7 @@ querying the number of cpus in the system. 127. **GSGroupCreate** *type enum* - GS_GROUP_CREATE (= 127) + GS_GROUP_CREATE *purpose* Ask Global Services to create a group of resources. @@ -1948,7 +1948,7 @@ querying the number of cpus in the system. 128. **GSGroupCreateResponse** *type enum* - GS_GROUP_CREATE_RESPONSE (= 128) + GS_GROUP_CREATE_RESPONSE *purpose* Response to GSGroupCreate message @@ -1969,7 +1969,7 @@ querying the number of cpus in the system. 129. **GSGroupKill** *type enum* - GS_GROUP_KILL (= 129) + GS_GROUP_KILL *purpose* Ask Global Services to send the processes belonging to a specified group a specified signal. @@ -1998,7 +1998,7 @@ querying the number of cpus in the system. 130. **GSGroupKillResponse** *type enum* - GS_GROUP_KILL_RESPONSE (= 130) + GS_GROUP_KILL_RESPONSE *purpose* Response to GSGroupKill message @@ -2049,7 +2049,7 @@ querying the number of cpus in the system. 131. **GSGroupCreateAddTo** *type enum* - GS_GROUP_CREATE_ADD_TO (= 131) + GS_GROUP_CREATE_ADD_TO *purpose* Ask Global Services to create and add resources to an existing group of resources. @@ -2081,7 +2081,7 @@ querying the number of cpus in the system. 132. **GSGroupCreateAddToResponse** *type enum* - GS_GROUP_CREATE_ADD_TO_RESPONSE (= 132) + GS_GROUP_CREATE_ADD_TO_RESPONSE *purpose* Response to GSGroupCreateAddTo message @@ -2125,7 +2125,7 @@ querying the number of cpus in the system. 133. **GSGroupDestroyRemoveFrom** *type enum* - GS_GROUP_DESTROY_REMOVE_FROM (= 133) + GS_GROUP_DESTROY_REMOVE_FROM *purpose* Ask Global Services to destroy and remove specific resources from an existing group of resources. @@ -2154,7 +2154,7 @@ querying the number of cpus in the system. 134. **GSGroupDestroyRemoveFromResponse** *type enum* - GS_GROUP_REMOVE_FROM_RESPONSE (= 134) + GS_GROUP_REMOVE_FROM_RESPONSE *purpose* Response to GSGroupDestroyRemoveFrom message @@ -2205,7 +2205,7 @@ querying the number of cpus in the system. 117. **GSGroupList** *type enum* - GS_GROUP_LIST (= 117) + GS_GROUP_LIST *purpose* Request a list of the g_uid for all the groups of resources @@ -2229,7 +2229,7 @@ querying the number of cpus in the system. 118. **GSGroupListResponse** *type enum* - GS_GROUP_LIST_RESPONSE (= 118) + GS_GROUP_LIST_RESPONSE *purpose* Response to GSGroupList message @@ -2248,7 +2248,7 @@ querying the number of cpus in the system. 119. **GSGroupQuery** *type enum* - GS_GROUP_QUERY (= 119) + GS_GROUP_QUERY *purpose* Request the GroupDescriptor for a managed group of resources @@ -2279,7 +2279,7 @@ querying the number of cpus in the system. 120. 
**GSGroupQueryResponse** *type enum* - GS_GROUP_QUERY_RESPONSE (= 120) + GS_GROUP_QUERY_RESPONSE *purpose* Response to request for GroupDescriptor for a managed group @@ -2321,7 +2321,7 @@ for example ones related to sequencing runtime startup and teardown. 27. **AbnormalTermination** *type enum* - ABNORMAL_TERMINATION (= 27) + ABNORMAL_TERMINATION *purpose* Error result for startup and teardown messages, as well @@ -2347,7 +2347,7 @@ for example ones related to sequencing runtime startup and teardown. 28. **GSStarted** *type enum* - GS_STARTED (= 28) + GS_STARTED *purpose* Confirm to Launcher that the Global Services process (and if @@ -2364,7 +2364,7 @@ for example ones related to sequencing runtime startup and teardown. 29. **GSPingSH** *type enum* - GS_PING_SH (= 29) + GS_PING_SH *purpose* Confirm to Shepherd(s) that Global Services has started and @@ -2383,7 +2383,7 @@ for example ones related to sequencing runtime startup and teardown. 30. **GSIsUp** *type enum* - GS_IS_UP (= 30) + GS_IS_UP *purpose* Confirm to Launcher that Global Services is completely up @@ -2398,7 +2398,7 @@ for example ones related to sequencing runtime startup and teardown. 31. **GSHeadExit** *type enum* - GS_HEAD_EXIT (= 31) + GS_HEAD_EXIT *purpose* Notify Launcher that the head process has exited. At this point @@ -2416,7 +2416,7 @@ for example ones related to sequencing runtime startup and teardown. 32. **GSChannelRelease** *type enum* - GS_CHANNEL_RELEASE (= 32) + GS_CHANNEL_RELEASE *purpose* Tell the Shepherd(s) that Global Services is exiting and will @@ -2432,7 +2432,7 @@ for example ones related to sequencing runtime startup and teardown. 33. **GSHalted** *type enum* - GS_HALTED (= 33) + GS_HALTED *purpose* Notify Launcher that Global Services is halted. The Global @@ -2450,7 +2450,7 @@ for example ones related to sequencing runtime startup and teardown. 55. **GSTeardown** *type enum* - GS_TEARDOWN (= 55) + GS_TEARDOWN *purpose* Direct Global Services to do a clean exit - clean up any remaining @@ -2468,7 +2468,7 @@ for example ones related to sequencing runtime startup and teardown. 65. **GSPingProc** *type enum* - GS_PING_PROC (= 65) + GS_PING_PROC *purpose* When a new managed process wanting to use Global Services @@ -2503,10 +2503,10 @@ for example ones related to sequencing runtime startup and teardown. .. _gsdump: -66. **GSDump** +66. **GSDumpState** *type enum* - GS_DUMP (= 66) + GS_DUMP_STATE *purpose* Primarily debugging. Makes global services dump its state in @@ -2520,14 +2520,14 @@ for example ones related to sequencing runtime startup and teardown. - string - file to open and write the dump to - *implementation(s):* :func:`Python` + *implementation(s):* :func:`Python` .. _gsunexpected: 85. **GSUnexpected** *type enum* - GS_UNEXPECTED (= 85) + GS_UNEXPECTED *purpose* Whenever GS gets a message that is not expected as an input to GS @@ -2558,7 +2558,7 @@ for example ones related to sequencing runtime startup and teardown. 27. **ExceptionlessAbort** *type enum* - EXCEPTIONLESS_ABORT (= 115) + EXCEPTIONLESS_ABORT *purpose* It can be benefical to pass a message through the services runtime that @@ -2593,7 +2593,7 @@ LS Process Messages 34. **SHProcessCreate** *type enum* - SH_PROCESS_CREATE (= 34) + SH_PROCESS_CREATE *purpose* Request to Shepherd to launch a process locally. @@ -2648,7 +2648,7 @@ LS Process Messages 35. **SHProcessCreateResponse** *type enum* - SH_PROCESS_CREATE_RESPONSE (= 35) + SH_PROCESS_CREATE_RESPONSE *purpose* Response to process creation request. 
@@ -2677,7 +2677,7 @@ LS Process Messages 36. **SHProcessKill** *type enum* - SH_PROCESS_KILL (= 36) + SH_PROCESS_KILL *purpose* Request to kill a process owned by this shepherd, with @@ -2703,13 +2703,72 @@ LS Process Messages *implementation(s):* :func:`Python` +.. _shmultiprocesscreate: -.. _shprocesskillresponse: +138. **SHMultiProcessCreate** + + *type enum* + SH_MULTI_PROCESS_CREATE + + *purpose* + Request to Shepherd to launch multiple processes locally. + + *fields* + **pmi_group_info** + - Optional PMIGroupInfo structure. + - Contains common PMI/MPI values needed to start the requested PMI enabled + applications, including the nid_list, host_list. + **procs** + - list of SHProcessCreate messages representing the processes + to be started. + + *response* + SHMultiProcessCreateResponse + + *see also* + SHProcessKill + + refer to the :ref:`cfs` section for additional request message fields + + *implementation(s):* :func:`Python` + +.. _shmultiprocesscreateresponse: + +139. **SHMultiProcessCreateResponse** + + *type enum* + SH_MULTI_PROCESS_CREATE_RESPONSE + + *purpose* + Response to multi process creation request. + + *fields* + Alternatives on ``err``: + + SUCCESS (= 0) + **responses** + - list of SHProcessCreateResponse messages for each previously + requested process. + + FAIL (= 1) + **err_info** + - string + - what went wrong + + *request* + SHMultiProcessCreate + + *see also* + SHProcessKill + + *implementation(s):* :func:`Python` + + .. _shprocesskillresponse: 93. **SHProcessKillResponse** *type enum* - SH_PROCESS_KILL_RESPONSE (= 93) + SH_PROCESS_KILL_RESPONSE *purpose* Response to request to kill a process owned by this shepherd. @@ -2744,7 +2803,7 @@ LS Process Messages 37. **SHProcessExit** *type enum* - SH_PROCESS_EXIT (= 37) + SH_PROCESS_EXIT *purpose* This message is sent to Global Services when a managed process exits. @@ -2778,7 +2837,7 @@ OS resources. 79. **SHPoolCreate** *type enum* - SH_POOL_CREATE (= 79) + SH_POOL_CREATE *purpose* Create a new memory pool. @@ -2817,7 +2876,7 @@ OS resources. 80. **SHPoolCreateResponse** *type enum* - SH_POOL_CREATE_RESPONSE (= 80) + SH_POOL_CREATE_RESPONSE *purpose* Response to request to create a new memory pool. @@ -2854,7 +2913,7 @@ OS resources. 81. **SHPoolDestroy** *type enum* - SH_POOL_DESTROY (= 81) + SH_POOL_DESTROY *purpose* Request to destroy a memory pool. @@ -2877,7 +2936,7 @@ OS resources. 82. **SHPoolDestroyResponse** *type enum* - SH_POOL_DESTROY_RESPONSE (= 82) + SH_POOL_DESTROY_RESPONSE *purpose* Response to request to destroy a memory pool @@ -2909,7 +2968,7 @@ OS resources. 83. **SHExecMemRequest** *type enum* - SH_EXEC_MEM_REQUEST (= 83) + SH_EXEC_MEM_REQUEST *purpose* Request to execute a memory request. This message contains a serialized @@ -2939,7 +2998,7 @@ OS resources. 84. **SHExecMemResponse** *type enum* - SH_EXEC_MEM_RESPONSE (= 84) + SH_EXEC_MEM_RESPONSE *purpose* Response to request to execute a memory request. Note that there is no @@ -2984,7 +3043,7 @@ Channel Messages 38. **SHChannelCreate** *type enum* - SH_CHANNEL_CREATE (= 38) + SH_CHANNEL_CREATE *purpose* Request to create a channel in a memory pool known to this shepherd. @@ -3020,7 +3079,7 @@ Channel Messages 39. **SHChannelCreateResponse** *type enum* - SH_CHANNEL_CREATE_RESPONSE (= 39) + SH_CHANNEL_CREATE_RESPONSE *purpose* Response to channel allocation request. @@ -3049,7 +3108,7 @@ Channel Messages 40. 
**SHChannelDestroy** *type enum* - SH_CHANNEL_DESTROY (= 40) + SH_CHANNEL_DESTROY *purpose* Request to free a previously allocated channel. @@ -3072,7 +3131,7 @@ Channel Messages 41. **SHChannelDestroyResponse** *type enum* - SH_CHANNEL_DESTROY_RESPONSE (= 41) + SH_CHANNEL_DESTROY_RESPONSE *purpose* Response to request to free a previously allocated channel. @@ -3101,7 +3160,7 @@ Channel Messages PLACEHOLDER *type enum* - SH_LOCK_CHANNEL (= 42) + SH_LOCK_CHANNEL *purpose* Request to lock a channel @@ -3133,7 +3192,7 @@ Channel Messages PLACEHOLDER *type enum* - SH_LOCK_CHANNEL_RESPONSE (= 43) + SH_LOCK_CHANNEL_RESPONSE *purpose* Response to request to lock a channel @@ -3165,7 +3224,7 @@ Memory Allocation Messages PLACEHOLDER - maybe OBE *type enum* - SH_ALLOC_MSG (= 44) + SH_ALLOC_MSG *purpose* Request a shared memory allocation for a large message @@ -3188,7 +3247,7 @@ Memory Allocation Messages PLACEHOLDER - OBE? *type enum* - SH_ALLOC_MSG_RESPONSE (= 45) + SH_ALLOC_MSG_RESPONSE *purpose* Response to a requested allocation for a large message @@ -3215,7 +3274,7 @@ Memory Allocation Messages *type enum* - SH_ALLOC_BLOCK (= 46) + SH_ALLOC_BLOCK *purpose* Request a shared memory allocation for generic memory @@ -3250,7 +3309,7 @@ Memory Allocation Messages PLACEHOLDER - OBE? *type enum* - SH_ALLOC_BLOCK_RESPONSE (= 47) + SH_ALLOC_BLOCK_RESPONSE *purpose* Response to a requested allocation for generic memory @@ -3311,7 +3370,7 @@ value 0 in the single node case. 49. **SHChannelsUp** *type enum* - SH_CHANNELS_UP (= 49) + SH_CHANNELS_UP *purpose* Notify Launcher that this Shepherd has allocated the shared @@ -3342,7 +3401,7 @@ value 0 in the single node case. 50. **SHPingGS** *type enum* - SH_PING_GS (= 50) + SH_PING_GS *purpose* Acknowledge to Global Services that this Shepherd is up and @@ -3368,7 +3427,7 @@ value 0 in the single node case. 51. **SHHalted** *type enum* - SH_HALTED (= 51) + SH_HALTED *purpose* Notify launcher that this Shepherd is halted. @@ -3387,7 +3446,7 @@ value 0 in the single node case. 56. **SHTeardown** *type enum* - SH_TEARDOWN (= 56) + SH_TEARDOWN *purpose* Direct Shepherd to do a clean teardown. @@ -3403,7 +3462,7 @@ value 0 in the single node case. 57. **SHPingBE** *type enum* - SH_PING_BE (= 57) + SH_PING_BE *purpose* Shepherd handshake with MRNet backend @@ -3437,7 +3496,7 @@ value 0 in the single node case. 62. **SHHaltBE** *type enum* - SH_HALT_BE (= 62) + SH_HALT_BE *purpose* Shepherd telling the MRNet backend to exit. @@ -3452,7 +3511,7 @@ value 0 in the single node case. 52. **SHFwdInput** *type enum* - SH_FWD_INPUT (= 52) + SH_FWD_INPUT *purpose* Message carrying data intended to be written into the @@ -3486,7 +3545,7 @@ value 0 in the single node case. 53. **SHFwdInputErr** *type enum* - SH_FWD_INPUT (= 53) + SH_FWD_INPUT *purpose* Error response to a forward input message. This message @@ -3530,7 +3589,7 @@ value 0 in the single node case. 54. **SHFwdOutput** *type enum* - SH_FWD_OUTPUT (= 54) + SH_FWD_OUTPUT *purpose* Message carrying data from either stdout or stderr of a process @@ -3570,7 +3629,7 @@ value 0 in the single node case. 60. **SHHaltTA** *type enum* - SH_HALT_TA (= 60) + SH_HALT_TA *purpose* Message coming from Launcher to the Shepherd, telling it to tell TA to halt. @@ -3586,7 +3645,7 @@ value 0 in the single node case. 67. **SHDumpState** *type enum* - SH_DUMP (= 67) + SH_DUMP_STATE *purpose* Primarily debugging. 
Makes the Shepherd dump its state in @@ -3615,7 +3674,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi *type enum* - LA_BROADCAST (= 68) + LA_BROADCAST *purpose* @@ -3645,7 +3704,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi *type enum* - LA_PASSTHRU_FB (= 69) + LA_PASS_THRU_FB *purpose* @@ -3684,7 +3743,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi *type enum* - LA_PASSTHRU_BF (= 70) + LA_PASS_THRU_BF *purpose* @@ -3716,7 +3775,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi *type enum* - LA_SERVER_MODE (= 86) + LA_SERVER_MODE *purpose* @@ -3755,7 +3814,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi *type enum* - LA_SERVER_MODE_EXIT (= 87) + LA_SERVER_MODE_EXIT *purpose* @@ -3775,7 +3834,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi 74. **LAProcessDict** *type enum* - LA_PROCESS_DICT (= 88) + LA_PROCESS_DICT *purpose* Return a dictionary of process information for all the processes @@ -3797,7 +3856,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi 75. **LAProcessDictResponse** *type enum* - LA_PROCESS_DICT_RESPONSE (= 89) + LA_PROCESS_DICT_RESPONSE *purpose* Responds with a dictionary for all the processes @@ -3820,7 +3879,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi 76. **LADumpState** *type enum* - LA_DUMP (= 90) + LA_DUMP_STATE *purpose* Primarily debugging. Makes the Launcher dump its state in @@ -3840,7 +3899,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi 77. **LANodeIdxSH** *type enum* - LA_NODEINDEX_SH (= 91) + LA_NODEINDEX_SH *purpose* Communicates the node index from the Launcher Back End to the @@ -3860,7 +3919,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi 78. **LAChannelsInfo** *type_enum* - LA_CHANNELS_INFO (=92) + LA_CHANNELS_INFO *purpose* Broadcast to all nodes to provide hostnames, node indices, @@ -3900,7 +3959,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi 94. **Breakpoint** *type_enum* - BREAKPOINT (=94) + BREAKPOINT *purpose* Inform front end that a managed process has reached a breakpoint for the first time @@ -3938,7 +3997,7 @@ the launcher in both regular and server mode. They are communicated through the 106. **BEIsUp** *type enum* - BE_IS_UP (= 106) + BE_IS_UP *purpose* Confirm to Launcher Frontend that the Backend is up and send its serialized @@ -3964,7 +4023,7 @@ the launcher in both regular and server mode. They are communicated through the 58. **BEPingSH** *type enum* - BE_PING_SH (= 58) + BE_PING_SH *purpose* MRNet backend handshake with Shepherd @@ -3979,7 +4038,7 @@ the launcher in both regular and server mode. They are communicated through the 63. **BEHalted** *type enum* - BE_HALTED (= 63) + BE_HALTED *purpose* Indicate that the MRNet backend instance on this node has exited normally. @@ -3995,7 +4054,7 @@ the launcher in both regular and server mode. They are communicated through the 91. **BENodeIdxSH** *type enum* - BE_NODEINDEX_SH (= 91) + BE_NODE_IDX_SH *purpose* @@ -4042,7 +4101,7 @@ the launcher in both regular and server mode. 107. **FENodeIdxBE** *type enum* - FE_NODE_IDX_BE (= 107) + FE_NODE_IDX_BE *purpose* The Frontend sends the node index to the Backend, based on the backend's host_id. 
@@ -4061,7 +4120,7 @@ the launcher in both regular and server mode. 108. **HaltOverlay** *type enum* - HALT_OVERLAY (= 108) + HALT_OVERLAY *purpose* Indicate that monitoring of the overlay network should cease @@ -4076,7 +4135,7 @@ the launcher in both regular and server mode. 109. **HaltLoggingInfra** *type enum* - HALT_LOGGING_INFRA (= 109) + HALT_LOGGING_INFRA *purpose* Indicate that monitoring of logging messages from the backend should cease @@ -4092,7 +4151,7 @@ the launcher in both regular and server mode. 116. **LAExit** *type enum* - LA_EXIT (= 116) + LA_EXIT *purpose* Indicate the launcher should exit. Use in case the launcher teardown was unable to @@ -4120,7 +4179,7 @@ logged to file or terminal. 97. **LoggingMsg** *type enum* - LOGGING_MSG (= 97) + LOGGING_MSG *purpose* To take logging strings provided via python logging (``eg: log.info('message')``) @@ -4175,7 +4234,7 @@ logged to file or terminal. 98. **LoggingMsgList** *type enum* - LOGGING_MSG_LIST (= 98) + LOGGING_MSG_LIST *purpose* Takes a list of :ref:`LoggingMsg ` and aggregates them into a single @@ -4193,7 +4252,7 @@ logged to file or terminal. 99. **LogFlushed** *type enum* - LOG_FLUSHED (= 99) + LOG_FLUSHED *purpose* Sent by MRNet server backend to frontend after it has completed its final @@ -4219,7 +4278,7 @@ related to setup and teardown control through the *TA Channel*. 59. **TAPingSH** *type enum* - TA_PING_SH (= 59) + TA_PING_SH *purpose* Indicate that the TA instance on this node has come up and @@ -4236,7 +4295,7 @@ related to setup and teardown control through the *TA Channel*. 61. **TAHalted** *type enum* - TA_HALTED (= 61) + TA_HALTED *purpose* Indicate that the TA instance on this node has exited normally. @@ -4253,7 +4312,7 @@ related to setup and teardown control through the *TA Channel*. 64. **TAUp** *type enum* - TA_UP (= 64) + TA_UP *purpose* Indicate that the TA instance on this node is up and ready. @@ -4268,7 +4327,7 @@ related to setup and teardown control through the *TA Channel*. 109. **OverlayPingBE** *type enum* - OVERLAY_PING_BE (= 109) + OVERLAY_PING_BE *purpose* Indicate that the Overlay instance on this backend node is up and ready. @@ -4283,7 +4342,7 @@ related to setup and teardown control through the *TA Channel*. 110. **OverlayPingLA** *type enum* - OVERLAY_PING_LA (= 110) + OVERLAY_PING_LA *purpose* Indicate that the Overlay instance on this frontend node is up and ready. @@ -4299,7 +4358,7 @@ related to setup and teardown control through the *TA Channel*. 112. **LAHaltOverlay** *type enum* - LA_HALT_OVERLAY (= 112) + LA_HALT_OVERLAY *purpose*] Launcher frontend requests a shutdown of Overlay agent @@ -4314,7 +4373,7 @@ related to setup and teardown control through the *TA Channel*. 113. **BEHaltOverlay** *type enum* - BE_HALT_OVERLAY (= 113) + BE_HALT_OVERLAY *purpose*] This backend node requests a shutdown of its Overlay agent @@ -4329,7 +4388,7 @@ related to setup and teardown control through the *TA Channel*. 114. **OverlayHalted** *type enum* - OVERLAY_HALTED (= 114) + OVERLAY_HALTED *purpose*] This overlay instance has shutdown diff --git a/doc/infrastructure/multi_node_deployment.rst b/doc/infrastructure/multi_node_deployment.rst index 26f0246..8a1bbb2 100644 --- a/doc/infrastructure/multi_node_deployment.rst +++ b/doc/infrastructure/multi_node_deployment.rst @@ -15,13 +15,13 @@ work of communicating off node when necessary. :numref:`deploy-multi-node` and : multi-node version of the Dragon :ref:`Services`. .. 
figure:: images/deployment_multi_node.svg - :name: deploy-multi-node + :name: deploy-multi-node **Startup Overview** .. figure:: images/multinodeoverview.png :scale: 30% - :name: multi-node-overview + :name: multi-node-overview **Multi-Node Overview of Dragon Services** diff --git a/doc/infrastructure/processes.rst b/doc/infrastructure/processes.rst index 335133a..1352268 100644 --- a/doc/infrastructure/processes.rst +++ b/doc/infrastructure/processes.rst @@ -190,7 +190,7 @@ further describes those activities. .. figure:: images/launchproc.srms1.png :scale: 75% - :name: launchproc + :name: launchproc **Launcher Component Interaction during Process Interaction** diff --git a/doc/infrastructure/single_node_deployment.rst b/doc/infrastructure/single_node_deployment.rst index f384b80..dec7243 100644 --- a/doc/infrastructure/single_node_deployment.rst +++ b/doc/infrastructure/single_node_deployment.rst @@ -13,12 +13,12 @@ user application decides to directly spawn processes itself, it retains the resp resources they use. .. figure:: images/deployment_single_node.svg - :name: deploy-single-node + :name: deploy-single-node **Deployment diagram a single node** .. figure:: images/singlenodeoverview.png - :name: singlenode-overview + :name: singlenode-overview **Single-Node Overview of Dragon Services** @@ -41,7 +41,7 @@ Single Node Bringup =================== .. figure:: images/startup_seq_single_node.svg - :name: startup-seq-single-node + :name: startup-seq-single-node **Startup Sequence on a single node** @@ -661,7 +661,7 @@ the teardown of the Dragon Services. .. figure:: images/single_teardown.srms1.png :scale: 75% - :name: teardown-seq-single-node + :name: teardown-seq-single-node **Single-Node Teardown Sequence** diff --git a/doc/pguide/owner.rst b/doc/pguide/owner.rst index f4ac91f..1122c64 100644 --- a/doc/pguide/owner.rst +++ b/doc/pguide/owner.rst @@ -29,17 +29,16 @@ in the Dragon Native API. Processes ========= -With version 0.5 Dragon only supports round robin placement of processes. Each new process will be placed on the next -node that Dragon is running on. Future releases of Dragon will allow for two levels of control that improve upon the -current behavior. The first will be the ability to control the default policy, and the second will be explicit -placement control for each process. +Dragon, by default, does a round robin placement of processes. Each new process will be placed on the next +node that Dragon is running on. Dragon also allows for two levels of control that improve upon the +current behavior. The first is the ability to control the default policy, and the second is explicit placement control for each process. Round Robin Placement --------------------- .. figure:: images/roundrobin.svg :scale: 75% - :name: roundrobin + :name: roundrobin **UML deployment diagram of round robin placement with 4 processes on a distributed system with 3 compute nodes and one login node** @@ -47,7 +46,6 @@ Round Robin Placement policy. Any two processes started consecutively will be placed on unique nodes unless there is only a single node within your Slurm allocation. - Managed Memory ============== diff --git a/doc/pguide/stack.rst b/doc/pguide/stack.rst index 72f120e..5b7f4ff 100644 --- a/doc/pguide/stack.rst +++ b/doc/pguide/stack.rst @@ -2,17 +2,17 @@ The API Stack +++++++++++++ Before you start programming with Dragon, you need to decide which API you want -to program to. The runtime provides a stack of interfaces abstracting +to program to. 
The runtime provides a stack of interfaces abstracting :term:`resources ` of a distributed system, ranging from low-level shared memory to a distributed dictionary. It is *composable*, meaning the APIs are built on top of each other (see :numref:`dragon-api-stack`). We deliberately expose the whole stack of APIs so that you can choose if you want to interact with the complete -runtime or want to use only parts of it. For an overview, see table 1 in the +runtime or want to use only parts of it. For an overview, see table 1 in the :ref:`reference section `. .. figure:: images/dragon_api_stack.png :scale: 25% - :name: dragon-api-stack + :name: dragon-api-stack **The Dragon API stack** @@ -20,14 +20,14 @@ runtime or want to use only parts of it. For an overview, see table 1 in the Lower level interfaces yield less convenient objects. Thus new and experienced users should consider programming to Dragon in two ways: -1. In Python using :ref:`pguide/dragon_multiprocessing:Multiprocessing with Dragon`, if they +1. In Python using :ref:`pguide/dragon_multiprocessing:Multiprocessing with Dragon`, if they want to make an existing Python code scale to a distributed system quickly. -2. In C, C++, Fortran and Python using the :ref:`ref/native/index:Dragon Native` API, if they want to take advantage +2. In C, C++, Fortran and Python using the :ref:`ref/native/index:Dragon Native` API, if they want to take advantage of all Dragon features or need to use languages other than Python. The objects provided by these two APIs have the following properties: -* **interoperable**: a named Python Multiprocessing Queue object can be used as a managed Dragon Native Queue in C with the same name. +* **interoperable**: a named Python Multiprocessing Queue object can be used as a managed Dragon Native Queue in C with the same name. * **transparent**: :term:`objects ` can be used everywhere on a system of distributed or even federated nodes. * **shareable**: objects can be serialized and passed to other programs, processes or threads via stdin. * **managed**: :term:`objects ` can be looked up by :term:`name ` or :term:`uid ` to retrieve their :term:`serialized descriptor `. @@ -40,7 +40,7 @@ programming languages. In the future, experienced developers can further program to the -3. :term:`Unmanaged ` Dragon Native API, if they want to use composite objects with improved performance. See :ref:`uguide/resource_model:Performance Costs`. +3. :term:`Unmanaged ` Dragon Native API, if they want to use composite objects with improved performance. See :ref:`uguide/resource_model:Performance Costs`. 4. Dragon Client API or Dragon Infrastructure API, if they want to extend the functionality of the Dragon runtime by extending Dragon Services. 5. Dragon Core API, to use core functionality in their own programs without starting the runtime. To use the Dragon core API on its own, see also :ref:`pguide/dragon_multiprocessing:Multiprocessing and Dragon without Patching` @@ -51,8 +51,8 @@ Architecture of a Dragon Program .. figure:: images/api_use_python.svg :scale: 75% - :name: api-use-python - + :name: api-use-python + **Architecture of a user program using Dragon with Python Multiprocessing or Dragon Native. Internal Dragon APIs are not shown.** In :numref:`api-use-python` we show a component diagram of the architecture of a Dragon program @@ -65,7 +65,7 @@ API. Dragon services manage the primary objects and communicate using the infrastructure message component on top of the core API. 
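As a concrete illustration of the first path above (Python Multiprocessing with Dragon), the following minimal sketch shows the typical shape of such a program. It is illustrative only and assumes what the :ref:`pguide/dragon_multiprocessing:Multiprocessing with Dragon` guide describes, namely that importing ``dragon`` and selecting the ``dragon`` start method routes standard multiprocessing objects onto Dragon Native objects; consult that guide for the authoritative usage.

.. code-block:: Python
   :caption: **Minimal Multiprocessing with Dragon program (illustrative sketch)**

   import dragon                    # import before multiprocessing objects are created
   import multiprocessing as mp


   def worker(q):
       q.put("hello from a Dragon-managed process")


   if __name__ == "__main__":
       mp.set_start_method("dragon")    # select Dragon's start method
       q = mp.Queue()                   # backed by a Dragon Native Queue
       p = mp.Process(target=worker, args=(q,))
       p.start()
       print(q.get())
       p.join()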
-* Python Multiprocessing with Dragon programs only use the Multiprocessing API. Our +* Python Multiprocessing with Dragon programs only use the Multiprocessing API. Our MPBridge component translates the Multiprocessing objects into Dragon native objects by heavily modifying the object APIs. This way we achieve limited interoperability between both APIs. @@ -75,7 +75,7 @@ Architecture of Advanced Use Cases .. figure:: images/api_use_core.svg :scale: 75% - :name: api-use-core + :name: api-use-core **Architecture of advanced use cases for the Dragon runtime. Internal APIs are not shown.** @@ -83,11 +83,11 @@ In :numref:`api-use-core` we show a component diagram of the architecture of adv Note that these use cases are not supported yet. * User programs using :term:`unmanaged ` Dragon native objects directly call into - Dragon, but do not require the infrastructure services to track :term:`names ` and + Dragon, but do not require the infrastructure services to track :term:`names ` and :term:`uids ` of their objects. This reduces the load on infrastructure services, which only provide :term:`transparency ` across :term:`distributed or federated systems `. -* Users may choose to extend the Dragon native API with their own composite objects, using Dragons +* Users may choose to extend the Dragon native API with their own composite objects, using Dragons native, client and core APIs for maximum flexibility. * User may want to use only the Dragon core components to extend their own programs with its components. In that case the infrastructure components of Dragon do not need to be started, Dragon core components can be - :ref:`directly imported and used `. \ No newline at end of file + :ref:`directly imported and used `. \ No newline at end of file diff --git a/doc/ref/client/index.rst b/doc/ref/client/index.rst index 481dac6..708a5b9 100644 --- a/doc/ref/client/index.rst +++ b/doc/ref/client/index.rst @@ -33,7 +33,7 @@ Architecture .. figure:: images/client_architecture.svg :scale: 75% - :name: GS-client-architecture + :name: GS-client-architecture **GS Client architecture** diff --git a/doc/ref/core/Cython/channels.rst b/doc/ref/core/Cython/channels.rst index cffc943..584d9a0 100644 --- a/doc/ref/core/Cython/channels.rst +++ b/doc/ref/core/Cython/channels.rst @@ -1,7 +1,7 @@ .. _DragonCoreChannelsCython: Channels -++++++++ ++++++++++++++ This is the Dragon channels interface for Python @@ -10,11 +10,11 @@ This is the Dragon channels interface for Python :local: :backlinks: entry -Classes +Classes ======= .. automodule:: dragon.channels - :members: Message, ChannelSendH, ChannelRecvH, Channel, Peer2PeerReadingChannelFile, Many2ManyReadingChannelFile, Many2ManyWritingChannelFile, GatewayMessage + :members: Message, ChannelSendH, ChannelRecvH, Channel, Peer2PeerReadingChannelFile, Many2ManyReadingChannelFile, Many2ManyWritingChannelFile, GatewayMessage Functions ========= diff --git a/doc/ref/core/Cython/fli.rst b/doc/ref/core/Cython/fli.rst new file mode 100644 index 0000000..e1556f2 --- /dev/null +++ b/doc/ref/core/Cython/fli.rst @@ -0,0 +1,27 @@ +.. _DragonCoreFLICython: + +File Like Interface ++++++++++++++++++++++++++ + +This is the FLI API for Python. The classes presented here are a thin wrapper of the C API. The C language description of the :ref:`DragonFileLikeInterface` +provides a detailed description of the FLI code and should be consulted for a good overview of this functionality. This section provides the +description of the Python interface to this C code. + +.. 
contents:: + :depth: 3 + :local: + :backlinks: entry + +Classes +======= + +.. automodule:: dragon.fli + :members: FLInterface, FLISendH, FLIRecvH, DragonFLIError, FLIEOT + +Exceptions +========== + +.. automodule:: dragon.fli + :members: DragonFLIError, FLIEOT + + diff --git a/doc/ref/core/Cython/managed_memory.rst b/doc/ref/core/Cython/managed_memory.rst index 2fabe15..f5e1386 100644 --- a/doc/ref/core/Cython/managed_memory.rst +++ b/doc/ref/core/Cython/managed_memory.rst @@ -1,7 +1,7 @@ .. _DragonCoreManagedMemoryCython: Managed Memory -++++++++++++++ ++++++++++++++++++++ This is the Dragon managed memory interface for Python @@ -10,7 +10,7 @@ This is the Dragon managed memory interface for Python :local: :backlinks: entry -Classes +Classes ======= .. automodule:: dragon.managed_memory @@ -20,14 +20,14 @@ Classes .. ========= .. .. automodule:: dragon.managed_memory -.. :members: +.. :members: Enums ===== .. automodule:: dragon.managed_memory - :members: PoolType, AllocType, - + :members: PoolType, AllocType, + Exceptions ========== diff --git a/doc/ref/core/c/channels.rst b/doc/ref/core/c/channels.rst index a705023..59ac455 100644 --- a/doc/ref/core/c/channels.rst +++ b/doc/ref/core/c/channels.rst @@ -1,74 +1,6 @@ .. _DragonCoreChannels: Channels -++++++++ - -.. contents:: Table of Contents - :local: - -Description -=========== - -Dragon Channels is the low-level transport layer for communicating messages -between POSIX processes in the Dragon runtime. Channels are a thread and -interprocess-safe queue where messages can be sent and received. The Dragon -run-time services itself uses channels for communication between processes. User -programs, utilizing the Dragon run-time services also use channels either -directly, or indirectly. This API would be used directly when a program wants to -communicate in the most efficient way possible while being willing to give up the -services provided by higher level communication and synchronization abstractions. -Choosing to communicate at the channels level will mean giving up features like -pickling and unpickling, automatic serialization of data, and the automatic -streaming of data over a channel. - -Channels provide flexible on-node and communication that processes use by -attaching and detaching to the underlying -:ref:`Managed Memory`. When Channels are used in conjunction with a -transport agent, transparent off-node communication is also provided when sending -and receiving messages using the Channels API. In this respect, Channels resemble -POSIX sockets as they are always available as a service and not built upon the -static distributed model that MPI or SHMEM is. :term:`Transparency` is provided because -the exact same Channels API calls work for both on-node and off-node -communication. The user program does not change when communicating off-node or -on-node. - -A major advantage of Channels is that they retain the flexibility of using -sockets or a filesystem while enabling zero-copy on-node transfers, single-copy -RDMA-enabled transfers off-node, and choices for blocking semantics. There is a -rich set of buffer management options that enable use cases such as: - - - static target-side managed payload landing pads much like SHMEM or - one-sided MPI - - - dynamic target-side managed payload landing pads much like two-sided MPI - - - static or dynamic origin-side managed payload landing pad, which nothing - else has - -Dragon Channels can reside in any valid :c:struct:`dragonMemoryDescr_t` as -provided by Dragon Memory Pools. 
This includes shared memory, a filesystem, or -private virtual memory. Payload buffers for messages can reside within a -channel, a memory pool :c:struct:`dragonMemoryPoolDescr_t` the channel was -allocated from, or any valid :c:struct:`dragonMemoryDescr_t` passed with a -message. This design allows Channels to provide multiple usage scenarios with -different performance and persistence characteristics. - -Channels provide both blocking an non-blocking semantics for sending and getting -messages. When sending, blocking can be chosen to wait when memory from a pool is -needed and not available. When getting messages, blocking receives will wait -until a message is available. Blocking operations can either idle wait, consuming -fewer resources and energy, or spin wait, with relatively no wakeup cycles. -Channels are highly configurable and customizable for whatever situation they are -used in at the expense of being a low-level primitive synchronization and -communication construct. - -Example -========== - -:ref:`An example can be found here `. The example illustrates how -to use the C Channels API. - -Channels API ============== Constants diff --git a/doc/ref/core/c/channelsets.rst b/doc/ref/core/c/channelsets.rst index 7b12516..5426d8d 100644 --- a/doc/ref/core/c/channelsets.rst +++ b/doc/ref/core/c/channelsets.rst @@ -1,7 +1,7 @@ .. _ChannelSets: ChannelSets -+++++++++++++++++++ +=============== This is a placeholder for future doc work on ChannelSets. See the Channels documentation for an example with more detail. @@ -9,16 +9,16 @@ more detail. .. _ChannelSetAPI: Description -============== +''''''''''''''' TBD. Example -========== +''''''''''' TBD. API -===== +''''''' These are the user-facing API calls for ChannelSet objects. diff --git a/doc/ref/core/c/fli.rst b/doc/ref/core/c/fli.rst new file mode 100644 index 0000000..ba79327 --- /dev/null +++ b/doc/ref/core/c/fli.rst @@ -0,0 +1,39 @@ +.. _DragonFileLikeInterface: + +File Like Interface +==================== + +Constants +'''''''''''''''''''''''''''' + +.. doxygengroup:: fli_consts + :content-only: + :members: + +Structures +'''''''''''''''' + +.. doxygengroup:: fli_structs + :content-only: + :members: + +FLI Lifecycle Management +''''''''''''''''''''''''''''' + +.. doxygengroup:: fli_lifecycle + :content-only: + :members: + +FLI Send/Recv Handle Management +'''''''''''''''''''''''''''''''''''' + +.. doxygengroup:: fli_handles + :content-only: + :members: + +FLI Send/Recv Functions +''''''''''''''''''''''''''''' + +.. 
doxygengroup:: fli_sendrecv + :content-only: + :members: diff --git a/doc/ref/core/images/channel.png b/doc/ref/core/images/channel.png new file mode 100644 index 0000000..4de7c01 Binary files /dev/null and b/doc/ref/core/images/channel.png differ diff --git a/doc/ref/core/images/fli_main.png b/doc/ref/core/images/fli_main.png new file mode 100644 index 0000000..3dc908b Binary files /dev/null and b/doc/ref/core/images/fli_main.png differ diff --git a/doc/ref/core/images/fli_receiver.png b/doc/ref/core/images/fli_receiver.png new file mode 100644 index 0000000..32d6115 Binary files /dev/null and b/doc/ref/core/images/fli_receiver.png differ diff --git a/doc/ref/core/images/fli_sender.png b/doc/ref/core/images/fli_sender.png new file mode 100644 index 0000000..6b06adf Binary files /dev/null and b/doc/ref/core/images/fli_sender.png differ diff --git a/doc/ref/core/images/managed_memory.png b/doc/ref/core/images/managed_memory.png new file mode 100644 index 0000000..082fd2f Binary files /dev/null and b/doc/ref/core/images/managed_memory.png differ diff --git a/doc/ref/core/index.rst b/doc/ref/core/index.rst index db27728..da5d6b6 100644 --- a/doc/ref/core/index.rst +++ b/doc/ref/core/index.rst @@ -1,10 +1,7 @@ .. _DragonCore: Dragon Core -+++++++++++ - - ++++++++++++++++++++++ The Dragon Core API supports much of the rest of the Dragon run-time services. It can also be programmed to directly by knowledgable programmers. The API has at @@ -13,14 +10,574 @@ C, the API is interoperable with other languages, including C, C++, Python, and Fortran (C++ and Fortran bindings are currently not available as of this writing). -Shared memory allocations come from memory pools. Both memory pools and memory -pool allocations are part of the shared memory API. +The following sections provide an overview of the major components of the Dragon +Core. The sections following this introduction provide a detailed description of +the API and objects that are a part of the Dragon Core. + +.. _managed_mem_overview: + +Managed Memory +========================= + +.. figure:: images/managed_memory.png + :scale: 50% + :name: managed_mem_fig + + **Conceptual View of Managed Memory** + + +As depicted in :numref:`managed_mem_fig`, shared memory allocations come from +memory pools. Both memory pools and memory pool allocations are part of the +shared memory API. POSIX shared memory provides the underlying mechanism through +which managed memory is implemented. Managed memory pools consist of two parts, a +manifest and a data. The manifest is where all meta-data about the managed memory +pool is stored. The data part is where allocations are made. The data part is +managed by a heap manager whose state is maintained in the manifest. The data is +split up into segments which are the smallest allocatable block of data within +the data part. The size of the memory pool, size of segments, and number of +segments are all configurable at run-time. + +The heap manager, a sub-component of managed memory, allocates data from the pool +by allocating one or more contiguous segments that together are as big or bigger +than a requested allocation. Should the managed memory pool be full or unable to +make an allocation of a specific size, but may be able to in the future once +other allocations are freed, allocation requests may block the requesting process +until the allocation is available. Processes decide whether to use blocking +allocation calls or not and decide on the amount of time they are willing to +wait. 
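The pool concepts above can be tied together with a short Python sketch that creates a pool and makes an allocation from it. This is illustrative only: the ``MemoryPool`` class belongs to the ``dragon.managed_memory`` module documented elsewhere in this reference, but the constructor arguments, the method names, and the blocking behavior noted in the comments are assumptions that should be verified against the API documentation.

.. code-block:: Python
   :caption: **Creating a pool and an allocation (illustrative sketch)**

   from dragon.managed_memory import MemoryPool

   POOL_SIZE = 2**30     # pool size, segment size and segment count are run-time choices
   POOL_UID = 17         # assumed unique pool identifier for this example

   pool = MemoryPool(POOL_SIZE, "example_pool", POOL_UID)
   try:
       # An allocation may block until enough contiguous segments are free,
       # subject to whatever timeout the caller is willing to accept.
       mem = pool.alloc(4096)
       # A serialized descriptor is what gets handed to other processes so
       # they can attach to the same allocation, as described just below.
       sdesc = mem.serialize()
   finally:
       pool.destroy()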
+ +Managed memory allocation is thread and interprocess safe. Interprocess locking +guarantees that critical sections are executed without interleaving between +processes. Managed memory is an interprocess and thread-safe dynamic memory +allocation framework. + +To share a managed memory allocation a serialized descriptor can be created and +shared. Other processes which receive a copy of the serialized descriptor can +then attach to the same shared memory within their own process. This concept of +serializing an object to be shared with other processes that can then attach to +it is widespread within Dragon. Managed memory, channels, and things that are +built from these fundamental building blocks all can be shared by creating a +serialized representation that can then be attached in other processes. + +Managed memory allocations can be attached, even off-node. However, memory +allocations are hosted in shared memory of a single node. Attaching to a managed +memory allocation is different than getting a pointer into the data. Managed +memory descriptors can be passed around at will in the Dragon run-time and +between Dragon managed processes. But, a process must be running on the node +where the pool exists to obtain a pointer into a managed memory allocation. + +When the Dragon run-time is started, two managed memory pools are created per node +to facilitate communication within the run-time. An infrastructure pool is created that +is used strictly by infrastructure supporting code internal to Dragon. A default pool +is also allocated which is used by APIs when making internal allocations on behalf of a +user. + +.. _channels_overview: + +Channels +================== + +As shown in :numref:`managed_mem_fig`, a channel, at its most fundamental level, +is an allocation in a managed memory pool. Channels only exist inside managed +memory pools. All channels are allocated out of some managed memory pool. +Internally, as shown at a very high level in :numref:`channel_pic` a channel is +composed of finite number of blocks. Both the number of blocks and the block size +are configurable at run-time. + +.. figure:: images/channel.png + :scale: 50% + :name: channel_pic + + **Conceptual View of a Channel's Structure** The channels API provides an organized means of synchronizing and communicating between processes. Channels provides a queue-like interface between these processes both on-node and off-node (when used in conjunction with a transport -service). +service). Channels are a thread and interprocess-safe queue where messages can be +sent and received. + +A channel, like managed memory, can be serialized and shared with other +processes. A channel, like managed memory, can also be attached both on-node and +off-node. The channels library itself recognizes when a channel is located +off-node and coordinates with a gateway channel which in turn is serviced by a +transport service to provide communication from a process that is located on a +node different from the channel with which it is communicating. + +Messages are sent into channels and are captured inside a channel block. Each +block either contains a message or contains a reference to a message if the +message is too big to be contained in the block. When a message is sent, an +unallocated block is allocated and added to the list of allocated blocks to be +dequeued in sequential order. When a block is unavailable, a process may wish to +block until a block is available to send its message. 
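The lifecycle just described, creating a channel inside a pool, sending a message into a block, and copying it back out on the receiving side, can be sketched in Python as follows. The ``Channel`` class name comes from the ``dragon.channels`` module listed earlier; the handle helpers and the ``send_bytes``/``recv_bytes`` calls are assumptions made for illustration and should be checked against the Channels reference.

.. code-block:: Python
   :caption: **Sending and receiving through a channel (illustrative sketch)**

   from dragon.managed_memory import MemoryPool
   from dragon.channels import Channel

   pool = MemoryPool(2**30, "example_pool", 17)   # channels always live in a pool
   ch = Channel(pool, 42)                         # 42 is an assumed unique cuid

   sendh = ch.sendh()                             # assumed send handle helper
   recvh = ch.recvh()                             # assumed receive handle helper
   sendh.open()
   recvh.open()

   sendh.send_bytes(b"hello")                     # copied into a channel block
   print(recvh.recv_bytes())                      # copied back out of the block

   sendh.close()
   recvh.close()
   ch.destroy()
   pool.destroy()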
+ +Channels provide both blocking and non-blocking semantics for sending and getting +messages. When sending, blocking can be chosen to wait when memory from a pool is +needed and not available. When getting messages, blocking receives will wait +until a message is available. All blocking operations within Dragon can either +idle wait, consuming fewer resources and energy, or spin wait, with relatively no +wakeup cycles. Channels are highly configurable and customizable for whatever +situation they are used in at the expense of being a low-level primitive +synchronization and communication construct. + +When a message is dequeued the relevant message payload is copied from the block +and its corresponding block is added to the list of unallocated blocks to be used +again. In this way a channel's blocks are re-used during its lifetime. + +During the enqueue operation (called send_msg in the API) message data is copied +into a block and during the dequeue operation (called get_msg in the API) message +data is copied out of the block. For large messages the contents of a block is a +serialized descriptor of the message and therefore the actual message contents +may not be copied but rather shared in a controlled manor between processes +providing the potential for a zero copy transfer of data between processes when +both processes are located on-node. + +Out of these basic concepts all other, higher level objects are constructed. Both +channels and managed memory also rely on even more primitive sub-components to do +their work. These sub-components provide locking, blocking, heap management, and +various other services that are needed by these two fundamental building blocks, but +managed memory and channels are the fundamental building blocks of all higher level +objects like the File Like Interface described in the next section. + +Further Information +''''''''''''''''''''''' + +Dragon Channels is the low-level transport layer for communicating messages +between POSIX processes in the Dragon runtime. The Dragon run-time services +itself uses channels for communication between processes. User programs, +utilizing the Dragon run-time services also use channels either directly, or +indirectly. This API would be used directly when a program wants to communicate +in the most efficient way possible while being willing to give up the services +provided by higher level communication and synchronization abstractions like FLIs +and even higher level abstractions like queues. Choosing to communicate at the +channels level will mean giving up features like pickling and unpickling, +automatic serialization of data, and the automatic streaming of data over a +channel. + +A major advantage of Channels is that they retain the flexibility of using +sockets or a filesystem while enabling zero-copy on-node transfers, single-copy +RDMA-enabled transfers off-node, and choices for blocking semantics. There is a +rich set of buffer management options that enable use cases such as: + + - static target-side managed payload landing pads much like SHMEM or + one-sided MPI + + - dynamic target-side managed payload landing pads much like two-sided MPI + + - static or dynamic origin-side managed payload landing pad, which nothing + else has + +Dragon Channels can reside in any valid :c:struct:`dragonMemoryDescr_t` as +provided by Dragon Memory Pools. This includes shared memory, a filesystem, or +private virtual memory. 
Payload buffers for messages can reside within a +channel, a memory pool :c:struct:`dragonMemoryPoolDescr_t` the channel was +allocated from, or any valid :c:struct:`dragonMemoryDescr_t` passed with a +message. This design allows Channels to provide multiple usage scenarios with +different performance and persistence characteristics. + +Example +''''''''''' + +:ref:`An example can be found here `. The example illustrates how +to use the C Channels API. + +Channels provide flexible on-node and communication that processes use by +attaching and detaching to the underlying +:ref:`Managed Memory`. When Channels are used in conjunction with a +transport agent, transparent off-node communication is also provided when sending +and receiving messages using the Channels API. In this respect, Channels resemble +POSIX sockets as they are always available as a service and not built upon the +static distributed model that MPI or SHMEM is. :term:`Transparency` is provided because +the exact same Channels API calls work for both on-node and off-node +communication. The user program does not change when communicating off-node or +on-node. + +.. _fli_overview: + +File Like Interface +============================== + +The File Like Interface (i.e. FLI) provides an abstraction over channels of 1:1 +communication between two end points. The abstraction supports streaming data +over a *connection* like interface. Sending and receiving data is done by opening +a send or receive handle. Once a send and a receive handle is opened, a 1:1 +connection exists between sender and receiver. Data can then be streamed over the +connection with a guarantee that data from other senders and receivers will not +interrrupt the stream between the two end points. + +There is no restriction about the location of the two end points. They may exist +in different processes on different nodes using channels that may be co-located +on the sending or receiving side, but are not required to be. There are certain +benefits to co-location, but no requirements for it in the FLI API. + +.. figure:: images/fli_main.png + :scale: 75% + :name: fli_main + + **File Like Interface Overview** + +The FLI is implemented as a channel of stream channels. A *Stream Channel* is a +channel that is designated for one sender and one receiver to share while carrying +on a streaming conversation. When the conversation is over, the stream channel is +recycled according to the FLI protocol. + +When an FLI is created, a *Main Channel* and a *Manager Channel* are created in +the nominal case. When the FLI is created, there are some number of stream +channels that are supplied. These stream channels are serialized and their +serialized descriptors are placed in the manager channel. The manager channel is +then a channel of channel descriptors. + +When a *Sender* comes along, it opens a send handle on the FLI which then +receives a serialized channel descriptor for a stream channel from the manager +channel. A sender looks for stream channels in the manager channel and attempts +to dequeue it when the send handle is opened. + +At the moment the send handle is opened, the serialized representation of it is +then written into the *Main Channel*. This means that as soon as there is a +sender, the stream channel becomes available to a receiver to begin receiving +streaming data. A sender and receiver can be carrying on their 1:1 conversation +as soon as a sender is opens a send handle. 
+ +When a *Receiver* opens a receive handle, the FLI API receives a stream channel +from the main channel and adds it to the receive handle. The receiver can begin +receiving data on the stream channel at that point. + +When a sender closes a send handle a special message is sent to the receiving +side to indicate that the conversation is over. This results in the receiver +receiving a return code of DRAGON_EOT to indicate that the end of the +conversation has been reached. + +Once the receiver has received DRAGON_EOT it must respond by closing the receive +handle. When the receive handle is closed, the stream channel is recycled by +enqueuing a serialized descriptor for it back into the *Manager Channel*. + +Controlling the Placement of Data +''''''''''''''''''''''''''''''''''' + +One nuance of the FLI design can help in the placement of data within a pool. +When data is sent to a channel, either on-node or off-node, the data must be +copied into a managed memory pool while it is transferred through the stream +channel. + +In the absence of further information about a pool from which to allocate space, +the pool of the channel being sent to will be used. If the user application +wishes to optimally handle data by placing it in a pool and minimizing copies of +that data, then creating the stream channels from the same pool will mean that +the FLI will automatically place data that was sent to a stream channel into the +same pool as the stream channel and therefore into a pool chosen by the user. The +net result of this rather complicated explanation is that if the receiver creates +the FLI then it may be beneficial to first create a pool and stream channels from +that same pool to optimally minimize the copying of data. + +Example +''''''''''' + +Here is some sample code for creating an FLI using only the C interface. This +example code was taken from the test/channel_subtests/test_fli.c program. Note +that for this to be useful production code, resources should be created using a +means of guaranteeing unique CUIDs for channels and if needed, unique MUIDs for +pools. For instance, you might use the dragon_create_process_local_channel API +call to create guaranteed unique CUIDs for you channels. + +.. code-block:: C + :linenos: + :caption: **Creating an FLI** + + #include + #include + + #define M_UID 0 + #define POOL_M_UID 2 + #define POOL "fli_test" + #define NUM_CHANNELS 10 + #define MAX_STREAM_SIZE 500 + + dragonError_t create_pool(dragonMemoryPoolDescr_t* mpool) { + /* Create a memory pool to allocate messages and a Channel out of */ + size_t mem_size = 1UL<<31; + + dragonError_t err = dragon_memory_pool_create(mpool, mem_size, POOL, POOL_M_UID, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to create memory pool"); + + return DRAGON_SUCCESS; + } + + dragonError_t create_channels(dragonMemoryPoolDescr_t* mpool, dragonChannelDescr_t channel[], int arr_size) { + int k; + dragonError_t err; + + for (k=0;k`). FLIs enable safe 1:1 conversations between a sender +and a receiver. FLIs are configurable to allow safe 1:1 streaming conversations as +well where the sender may stream content to a receiver over multiple messages. +FLIs and streaming FLIs are used extensively in the design of the distributed +dictionary and some familiarity with the FLI API is assumed. Each component has +two FLIs as shown in the figure. In general a main FLI is used to service new +incoming requests and a response FLI is used to handle responses to requests made +by a component. 
In the case of a client, two response FLIs are created to handle +streaming and non-streaming responses. + +Every FLI is created of some set of channels. Each component within the +distributed dictionary is responsible for creating its own FLIs and channels. The +required channels are created using a new API, supported by Dragon Local Services +on the node where a component is located. The new API, a class method named +*make_process_local* is callable via the :ref:`Channels API `. + +When a distributed dictionary is created by a client, the API first creates an +Orchestrator which is told details about the size and configuration of the +distributed dictionary being created. The orchestrator starts by creating two FLI +interfaces, its main FLI and its response FLI. It publishes its main FLI to the +client that created it by printing the serialized descriptor to the FLI to +standard out of the Orchestrator. + +The Orchestrator creates the specified number of Managers, providing each with the +orchestrator response FLI and its main FLI. The orchestrator receives registration +requests by each manager and responds to each manager on the manager's resposne FLI. + +Checkpointing +_________________ + +*CAVEAT - Checkpointing is not presently implemented. This section is presently +a design document.* + +When using a distributed dictionary it may be desirable to checkpoint operations +within the dictionary. Checkpointing is designed to minimize the interaction +between clients and managers to allow for scaling the dictionary to the sizes +needed to support large-scale supercomputing. + +In addition to checkpointing computation, persisting a checkpoint to disk is also +introduced, which is likely needed at scale. This is covered in more detail in +the next section. + +To implement checkpointing we want to avoid a global synchronization point to +achieve the maximum amount of parallelization and minimum impact to processes. +This means we need a protocol that allows some processes to work with data from a +new checkpoint while others continue to work with an older copy. Inherent in this +design is that all processes work on data that changes over time and there are +discrete moments where the data is updated. + +To implement checkpointing for these kinds of applications we introduce the +concept of a working set. A working set is a set of checkpoints that all reside +in memory at a given moment of time. + +The algorithm introduces a checkpoint id. This id starts at 0 when a +distributed dictionary is created. It is a 64-bit unsigned which can wrap around +(but probably won't). When a client wants to checkpoint it will invoke its +checkpoint function which will simply increment the checkpoint id modulo the +maximum 64-bit value (it will automatically wrap in C/C++, but will be added mod +the max 64-bit size in Python). No interaction with the managers or orchestrator +is necessary when a client checkpoints. + +Another goal of checkpointing is to keep from copying data unnecessarily. All +distributed dictionary client operations will include their checkpoint id. +Creation, Read, and Write operations will proceed as follows in the subheadings +below. + +When a checkpointing distributed dictionary is created, it may be told to persist +for every *persist_freq* number of checkpoints. When persisted, an argument, +*persist_base* will provide the base filename. A unique filename for the +checkpoint id and the particular manager id is constructed and used when +persisting a checkpoint. 
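The client-side checkpoint bookkeeping described above is small enough to sketch directly. The snippet below is a toy illustration, not the Dragon client class; in particular the filename helper is only a hypothetical rendering of "base name plus manager id plus checkpoint id".

.. code-block:: Python
   :caption: **Client checkpoint id and persist naming (illustrative sketch)**

   MAX_U64 = 2**64


   class ClientView:
       """Toy stand-in for a client's checkpoint state."""

       def __init__(self):
           self.chkpt_id = 0          # every client starts at checkpoint 0

       def checkpoint(self):
           # No manager or orchestrator interaction is needed; the id simply
           # advances and wraps like a 64-bit unsigned integer.
           self.chkpt_id = (self.chkpt_id + 1) % MAX_U64
           return self.chkpt_id


   def persist_filename(persist_base, manager_id, chkpt_id):
       # Hypothetical naming scheme: base name, manager id, checkpoint id.
       return f"{persist_base}_{manager_id}_{chkpt_id}"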
+ +Dictionary Creation +______________________ + +Internally to each manager, one dictionary per checkpoint copy in the working set +is maintained. For our purposes we'll call each working set item a checkpoint. A +checkpoint is a local, internal Python dictionary inside the manager. A manager +manages one shard of the distributed data and one checkpoint corresponds to one +internal dictionary that keeps track of the key/value pairs that were modified +during that checkpoint lifetime. + +When a dictionary is created, one pool is created which is big enough to hold all +the working set of the distributed dictionary. By doing this, no copying of +values is necessary between working sets. Working sets are maintained entirely by +the internal manager dictionaries that manage the keys and values. + +.. figure:: images/manager.png + :scale: 50% + :name: manager_pic + + **Detailed View of Manager** + +The internal dictionaries are maintained in a map of the working set, called +*working_set* as pictured in :numref:`manager_pic`. A map (i.e. another internal +dictionary) called *working_set*, maps the checkpoint number to a tuple of the +deleted keys and the checkpoint's working dictionary. Initially the checkpoint id +is 0 for all clients and each dictionary in the working_set of each manager is +empty. The working set contains dictionaries for 0, 1, 2, on up to the +working_set_size to begin. + +For each working set member (i.e. checkpoint), except the oldest, there is a +deleted_keys set. This is a set of all keys that were deleted for a given +checkpoint id. If a new value is stored in the working set for a checkpoint id +level, then the key is removed from the deleted_keys set. The deleted_keys set is +added to when a key is deleted from a checkpoint, but it exists at an older +checkpoint. Otherwise, the key is just deleted from the checkpoint in which it +exists. + +.. code-block:: Python + :linenos: + :name: ddict_create_proto + :caption: **Creating a Distributed Dictionary** + + d = DDict(managers_per_node=None, num_nodes=None, total_mem=None, working_set_size=1, + wait_for_keys=False, wait_for_writers=False, policy=None, persist_freq=None, + persist_base_name=None, timeout=None) + +:numref:`ddict_create_proto` shows the signature for creating a distributed +dictionary. When creating a distributed dictionary it is possible to specify the +number of managers and number of nodes on which the distributed dictionary will +be deployed. It is also possible to determine in more detail, for example on +which nodes, a distributed dictionary will be deployed by providing a policy on +the policy argument. The working set size may be specified as described above. A +working set size of one means that no checkpointing will occur. A size of two or +more allows checkpointing. Creating a working set size of more than two enables +additional distributed parallelism by allowing clients to operate more +independently. + +In certain use cases it may be that there are a set of keys that should not +persist between checkpoints AND that all keys written into one checkpoint should +be written into all checkpoints. To get this behavior, the *wait_for_keys* +argument should be set to *True*. In this case, it will be desirable to wait for +all such keys to be written. In this case, keys that are written as *d[key] = +value* will be assumed to be part of this set of keys. 
A second method of storing +key/value pairs by writing *d.pput(key, value)* will result in writing a +key/value that persists across checkpoints (i.e. persistent put) and will not be +a part of the set of keys that are waited upon. + +Waiting for keys means that a client that does a read or write for a checkpoint +that currently does not exist in the working set (i.e. beyond the end of the +working set) will block until all the keys in the retiring checkpoint have been +written and all clients have moved on to new checkpoints. + +With this mode of operation, readers of key/values in a checkpoint will also block +if the key is not yet available. Once available, the reader will get the value +and continue with execution. + +All blocking operations are subject to a timeout. A timeout of *None* indicates +to the distributed dictionary that clients want to wait forever for their +requests to be satisfied. Specifying timeout values is application specific, but +providing a timeout is a good idea. When creating a dictionary, the timeout that is +specified is propagated to the orchestrator and through it to all managers as well +providing a global timeout to the entire distributed dictionary and all operations +that should be subject to a timeout. A default value of 10 seconds is provided, but +this may be overridden by providing a timeout when the dictionary is constructed. + +A less restrictive option is to set *wait_for_writers* to *True* when creating +the distributed dictionary. In this case, all keys persist in the distributed +dictionary across checkpoints, but all writers must have advanced their +checkpoint before a checkpoint can be retired. It is assumed in this mode that +writers that have written to the dictionary in the past will also be writing the +same keys in the future. In this case then the distributed dictionary manager can +monitor all writers and require that they have moved on from a checkpoint before +retiring an older one. + +Under the *wait_for_writers* requirements, a writer requesting to move on from a +checkpoint will wait (i.e. block) if there are other writers that are still +writing at a checkpoint that would be retired. If a reader moves on to new +checkpoints, then it would continue, unblocked since keys persist across +checkpoints and reader that are reading at a newer checkpoint can still see +key/value pairs written in the past. + +These subtle differences in blocking and distributed dictionary behavior should +be carefully considered when writing an application. They provide different +behavior and synchronization opportunities. + +The persist_freq refers to a frequency that the distributed dictionary should be +persisted. It will be persisted as checkpoints are retired. The frequency refers +to how often a retiring checkpoint should be persisted. The persist_base_name is +used to determine the name of the persisted state. The manager_id of the manager +is appended to the base name followed by the checkpoint id of the persisted +state. + +Retiring and Persisting Checkpoints +_________________________________________ + +A checkpoint that will no longer be in the working set is removed from the +working set and retired. This is done with little to no impact on the processes +that are using the dictionary. When the checkpoint is about to be retired a few +checks are made depending on the options that were selected when the dictionary +was created. + +Checkpoints are retired when a client attempts to write into a new checkpoint that +has not been set up yet. 
In that case, the oldest checkpoint is retired and a new +checkpoint is created. + +If *wait_for_keys* was chosen when the dictionary was created, then all +non-persisting keys in the retiring checkpoint must exist in the next newer +checkpoint. If they do not, then the request to move to a new checkpoint will be +enqueued until later on an internal queue of requests to be processed later. +Under this option, even reads of non-persisting keys will be queued if the newer +checkpoint id does not exist. Any operations that attempt to get the state for a +newer checkpoint that depends on the keys of the newer checkpoint will also be +enqueued until the newer checkpoint exists. + +If *wait_for_keys* was chosen when the dictionary was created, then the retiring +checkpoint keys that are in the set of keys to persist are deep copied to +the next newer checkpoint unless the next newer checkpoint has the key in its +deleted_keys set or the key is already present in the newer checkpoint. + +If *wait_for_writers* was chosen when the dictionary was created, then all +writers into a checkpoint must have advanced to a checkpoint id greater +than the one to be retired before the checkpoint can be retired. If all +writers have not advanced, then the request to move to a new checkpoint +will be queued internally until it can be processed. + +If *wait_for_keys* was not specified on creation of the dictionary, then all keys +are treated as persisting keys when checkpoints are retired. + +If the retiring checkpoint is to be persisted, then all key/value pairs in the retiring +checkpoint are written to persistent storage. The retiring checkpoint's internal +dictionary is handed over to a process to persist the values to disk. As it does +so, the key/value pairs are deleted from the pool of the manager, thereby +releasing those resources. The persisting of the data can occur completely +independent of client interactions with the distributed dictionary. There are no +shared data resources except the pool which is already multi-thread and +multi-process safe. Otherwise there are no shared resources. + +One possible design for persisting to disk is to form DDPut messages (or another +capnp message) for each key/value pair in the pool and write them to a file +descriptor which represents a file opened by the manager. The captain proto +library supports writing to a file descriptor and we have a message already that +contains the checkpoint number, the key, and the value. When recovery was +initiated, a process could open the file, read the messages, and route the +messages right into the manager to restore it to that point in time. + +.. figure:: images/working_set.png + :scale: 50% + :name: workingset_pic + + **Working Set** + +Consider the working set given in :numref:`workingset_pic` for a dictionary with +all persistent keys. The figure shows that checkpoint 0, 1, 2, and 3 are in the +working set. During checkpoints 0, 1, and 3 the key *key1* was written into the +distributed dictionary. During checkpoint 2 a *keyA* was written into the +dictionary. During checkpoint 1 the *keyB* was written into the dictionary. But +during checkpoint 2 *keyB* was deleted from the dictionary. + +Now, if a client comes along that's got checkpoint 3 as its checkpoint id, and +looks up *keyB* it will not be found. However if another client currently at +checkpoint 1 comes along, it will discover *keyB* in the dictionary. For any key +the corresponding value also exists. + +The pool can hold duplicates of keys and values. 
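The *keyB* scenario above can be reproduced with a toy model of one manager's working set. This is illustrative Python only, not the manager implementation; it simply shows how the per-checkpoint dictionaries and their deleted_keys sets interact during a lookup.

.. code-block:: Python
   :caption: **Toy lookup over a working set (illustrative sketch)**

   # working_set maps checkpoint id -> (deleted_keys, checkpoint dictionary),
   # mirroring the state pictured in the working set figure.
   working_set = {
       0: (set(), {"key1": "v0"}),
       1: (set(), {"key1": "v1", "keyB": "b1"}),
       2: ({"keyB"}, {"keyA": "a2"}),
       3: (set(), {"key1": "v3"}),
   }


   def lookup(key, chkpt_id):
       # Walk from the requested checkpoint back toward the oldest one,
       # honoring any deletion recorded along the way.
       for cp in sorted(working_set, reverse=True):
           if cp > chkpt_id:
               continue
           deleted, kvs = working_set[cp]
           if key in kvs:
               return kvs[key]
           if key in deleted:
               return None        # deleted at this checkpoint: not found
       return None


   print(lookup("keyB", 3))       # None -> keyB was deleted at checkpoint 2
   print(lookup("keyB", 1))       # 'b1' -> still visible to a client at checkpoint 1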
The pool has no restrictions on +what can be stored within it. Each dictionary at each checkpoint is a separate +dictionary so the keys and values stored at each checkpoint are completely +independent of what is stored at other checkpoints. + +Assuming that the working set size of this dictionary is 4, then when a new +checkpoint comes along it will result in checkpoint 0 being retired. Since *key1* +exists at checkpoint 2, the *key1* from checkpoint 0 is simply deleted from the +pool and the dictionary is replaced by a new empty dictionary for checkpoint 4. + +Given the current state in :numref:`workingset_pic` a call to get the length of +the dictionary would result in finding 2 keys, *key1* and *keyA*. This can be +found by constructing a temporary set of keys. Starting with the keys of +checkpoint 0, add all the keys of checkpoint 1, then delete all deleted keys of +checkpoint 1. Add in all keys of checkpoint 2 and then delete all deleted keys +from the temporary set. Repeat this process for all checkpoint levels in the +working set. Then take the length of the computed temporary set and that gives +you the length of the dictionary, i.e. the number of keys active at any point in +time. Similarly, a call to find all keys at a particular checkpoint level can be +found using this algorithm. + +Read Operations +_________________ + +Read operations include get operations but also all operations that examine the +state of the distributed dictionary. A read operation includes the checkpoint +index. Here is how it proceeds: + + * Client sends get operation to correct manager with checkpoint id, chkpt_id. + + * If the chckpt_id is older than any checkpoint id in the working set, the oldest + checkpoint copy will be examined since that contains the base copy. If + *wait_for_keys* was specified and a reader tries to read a + non-persisting key older than the working set, the read is rejected. + + * If the chckpt_id is newer than any other checkpoint id in the working set, + then no worries. We use the newest chkpt_id we have in the working set in that + case unless *wait_for_keys* was specified and this is a non-persisting key. In + that case, the reader's request is queued internally until it can be processed. + + * Manager receives the message and examines the + working_set[checkpoint_map[chkpt_id]] dictionary for the key. If the value + is found, great! Return it. + + * If the key is found in the set of deleted keys for a checkpoint then return + that it was not found. + + * If the key being looked up is not found and the key is a persisting key + (i.e. because *wait_for_keys* was requested), then examine the next + older checkpoint in the working set by looking at the checkpoint + dictionary and also looking at the deleted_keys set. Keep repeating + this until the working set is exhausted or until the key is found. + Once the key is found, return its value or return not found depending + on where the key was found. + + * If the key being looked up is not found and is not in the set of persisting + keys (i.e. and *wait_for_keys* was requested) then queue up the request + internally until it can be satisfied. + + * If the working set is exhausted, then report that the key was not found. + +For operations where you are given a key, like *contains* for instance, the algorithm +is similar to the one above. + +For operations that need global state (like the length operation), you can do +set operations to form the correct state information. 
Write Operations
_________________

There are two types of write/put operations: one for persisting keys and one for
non-persisting keys. When *wait_for_keys* is *True*, *DDPut* is used for
non-persisting keys; otherwise it stores persisting keys. The *DDPPut* is the
persistent put operation. Exactly what occurs on a put differs between persistent
and non-persistent puts.

On persistent puts, steps proceed as follows:

Puts (and deletes) into the distributed dictionary come to a manager. Each put
operation now includes a chkpt_id that identifies which checkpoint it is written
into. If the chkpt_id does not exist in the working set of the manager, the
working set is rotated until it does. Rotating is described under the earlier
*Retiring and Persisting Checkpoints* heading.

A put operation then creates a new entry in the indicated checkpoint if the key
does not already exist there, and updates the value if the key does exist in that
checkpoint.

On a delete, the key is removed from the checkpoint dictionary if it exists
there, and if the key exists in an older checkpoint, it is also added to the
checkpoint's set of deleted_keys.

If a put or delete targets a checkpoint that no longer exists in the working set,
then it updates the oldest copy.

For non-persistent puts, the checkpoint id must be in the working set or newer.
If it is older than the working set, the put operation is rejected. If it is
newer than all checkpoints in the working set, then the oldest checkpoint is
examined and, if it does not contain all the non-persisting keys of the next
newer checkpoint, the put request is internally queued for later processing.


Message Flow Between Components
_________________________________________

The following sections illustrate the flow of messages between components of the
distributed dictionary. All messages are hidden behind the distributed dictionary
API. These are internal details of the implementation.

Bringup, Attach, Detach
-------------------------

Creating a distributed dictionary involves a Python client providing information
about how many managers, the number of nodes, the total amount of memory, and a
policy for where to place managers. The following diagram provides the details of
interactions between components for distributed dictionary bringup. Message
definitions appear in the aptly named section below. There are a few notes here
about this flow.

 * The client/manager attach flow (see the diagram) is not necessary when a
   client has sent the *DDRegisterClient* to a manager. In other words, the
   *DDRegisterClient* does all the work of the *DDRegisterClientID* message when
   it is sent to a manager, so it does not need to be repeated.

 * Not pictured in the diagram, the Orchestrator supports the
   *DDGetRandomManager* message and responds to it, since some clients may have
   been started on a node without a manager. When that occurs the client will
   receive a *Not Found* response to the *SHGetKV* message. In that case the
   client should fall back to sending the *DDGetRandomManager* message to the
   Orchestrator.

 * Each Manager and the Orchestrator are assigned a range of client IDs to
   assign. The Managers get 100,000 each based on the manager ID and starting at
   0. The Orchestrator gets the rest. In this way no two clients will get the
   same client ID. Necessarily, client IDs will not be sequentially allocated
   across all nodes. (A small illustration of this partitioning follows this
   list.)
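As a small illustration of the client id partitioning described in the last note
above, the id ranges could be carved up as follows. The helper names here are
hypothetical; only the 100,000-ids-per-manager figure comes from the description
above.

.. code-block:: python

   CLIENTS_PER_MANAGER = 100_000   # block of client ids reserved per manager

   def manager_id_range(manager_id):
       """Client ids a given manager may hand out (illustrative helper)."""
       start = manager_id * CLIENTS_PER_MANAGER
       return range(start, start + CLIENTS_PER_MANAGER)

   def orchestrator_first_id(num_managers):
       """The Orchestrator assigns ids above every manager's block."""
       return num_managers * CLIENTS_PER_MANAGER

   # For example, manager 1 hands out ids 100000-199999; with 4 managers the
   # Orchestrator starts at 400000, so no two clients ever share an id.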
.. figure:: images/ddict_bringup.srms1.png
   :scale: 75%
   :name: ddict_bringup

   **The Distributed Dictionary Bringup, Attach, Detach Sequence Diagram**


Teardown
----------

Bringing down a distributed dictionary is initiated by one client. Other clients
should already be aware that the dictionary is being destroyed. If not, they will
begin to get errors when interacting with the dictionary since its channels will
no longer exist.

.. figure:: images/ddict_teardown.srms1.png
   :scale: 75%
   :name: ddict_teardown

   **The Distributed Dictionary Teardown Sequence Diagram**

Put and Get Interaction
------------------------

Puts and gets are initiated by client programs. The key is hashed by the client
program's put or get API call and divided by the number of managers to obtain the
integer remainder (modulo operator). That value picks which manager is
responsible for the put or get operation for the given key. It is imperative that
all clients use the same hashing function and that all managers are in the same
order for all clients.

Put and get operations are designed to minimize the number of copies of data that
are made when they are performed. Because each manager creates its own FLI stream
channels, keys and values sent to the manager are automatically allocated from
the manager's pool, since allocations sent to a channel use that channel's pool
by default within the Dragon API.

Internally, managers see only managed memory allocations. Each key is one
allocation. Values are streamed to the manager through the file-like interface,
so values are typically a sequence of managed memory allocations. The internal
dictionary of each manager is a map from a managed memory allocation to a list of
managed memory allocations.

.. figure:: images/ddict_put.srms1.png
   :scale: 75%
   :name: ddict_put

   **The Distributed Dictionary Put Sequence Diagram**

Likewise, as shown in :numref:`ddict_get`, the value is streamed back to the
client on a get operation. Closing the send handle results in the EOT being
transmitted. The client simply reads the parts of a multi-part value until EOT is
signaled. In the low-level interface this surfaces as the DRAGON_EOT return code.
In Python it is signaled by an EOFError exception.

.. figure:: images/ddict_get.srms1.png
   :scale: 75%
   :name: ddict_get

   **The Distributed Dictionary Get Sequence Diagram**


Pop
-----

.. figure:: images/ddict_pop.srms1.png
   :scale: 75%
   :name: ddict_pop

   **The Distributed Dictionary Pop Sequence Diagram**

Contains
----------

.. figure:: images/ddict_contains.srms1.png
   :scale: 75%
   :name: ddict_contains

   **The Distributed Dictionary Contains Sequence Diagram**

Length
-----------

.. figure:: images/ddict_getLength.srms1.png
   :scale: 75%
   :name: ddict_get_length

   **The Distributed Dictionary Get Length Sequence Diagram**

Clear
--------

.. figure:: images/ddict_clear.srms1.png
   :scale: 75%
   :name: ddict_clear

   **The Distributed Dictionary Clear Sequence Diagram**

Get All Keys
---------------

.. figure:: images/ddict_keys.srms1.png
   :scale: 75%
   :name: ddict_keys

   **The Distributed Dictionary Get All Keys Sequence Diagram**


..
_pythonDDictClient: + +Python Reference +_________________ + +.. currentmodule:: dragon.data.ddict + +.. autosummary:: + :toctree: + :recursive: + + ddict + orchestrator + manager + +.. _DragonDDictCClient: + +C Reference +______________ + +.. contents:: Table of Contents + :local: + +Description +------------- + +The distributed dictionary C client description. + + +Example +------------ + +Here is some sample code for creating an FLI using only the C interface. This +example code was taken from the test/channel_subtests/test_fli.c program. Note +that for this to be useful production code, resources should be created using a +means of guaranteeing unique CUIDs for channels and if needed, unique MUIDs for +pools. For instance, you might use the dragon_create_process_local_channel API +call to create guaranteed unique CUIDs for you channels. + +.. code-block:: C + :linenos: + :caption: **Using the Distributed Dictionary C Client** + + #include + #include + + #define M_UID 0 + #define POOL_M_UID 2 + #define POOL "fli_test" + #define NUM_CHANNELS 10 + #define MAX_STREAM_SIZE 500 + + dragonError_t create_pool(dragonMemoryPoolDescr_t* mpool) { + /* Create a memory pool to allocate messages and a Channel out of */ + size_t mem_size = 1UL<<31; + + dragonError_t err = dragon_memory_pool_create(mpool, mem_size, POOL, POOL_M_UID, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to create memory pool"); + + return DRAGON_SUCCESS; + } + + dragonError_t create_channels(dragonMemoryPoolDescr_t* mpool, dragonChannelDescr_t channel[], int arr_size) { + int k; + dragonError_t err; + + for (k=0;k` + +#. **DDGetRandomManagerResponse** + + *type enum* + DD_GET_RANDOM_MANAGER_RESPONSE + + *purpose* + Orchestrator return fli of a main manager for clients to request connection to other managers. + + *fields* + + **mainFLI** + - string + - b64 encoded serialized main fli of the manager. + + *see also* + DDGetRandomManager + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDRegisterClient** + + *type enum* + DD_REGISTER_CLIENT + + *purpose* + Obtain unique client id from main manager and register client id to main manager. + + *fields* + + **response_fli** + - string + - b64 encoded serialized fli for response. + + **buffered_response_fli** + - string + - b64 encoded serialized fli for buffered response. + + *see also* + DDRegisterClientResponse, DDRegisterClientID + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDRegisterClientResponse** + + *type enum* + + DD_REGISTER_CLIENT_RESPONSE + + *purpose* + Provide the unique client id and number of managers. + + *fields* + + **client_id** + - uint32 + - unique for this client. + + **num_managers** + - uint32 + - number of managers in the dictionary. + + *see also* + DDRegisterClient + + Refer to the :ref:`cfs` section for additional response message fields. + + *implementation(s):* :func:`Python` + +#. **DDRegisterManager** + + *type enum* + DD_REGISTER_MANAGER + + *purpose* + Manager registers with Orchestrator and get a list of managers from Orchestrator. + + *fields* + + **response_fli** + - string + - b64 encoded serialized fli for the response to this request. + + **mainFLI** + - string + - b64 encoded serialized fli for the main channel for the manager. + + *see also* + DDRegisterManagerResponse + + Refer to the :ref:`cfs` section for additional request message fields. 
+ + *implementation(s):* :func:`Python` + +#. **DDRegisterManagerResponse** + + *type enum* + DD_REGISTER_MANAGER_RESPONSE + + *purpose* + Provide the acknowledgement that the manager is registered and a list of managers. This serves as a + synchronization point for client/manager interaction. Clients can request other manager's fli from the main manager assigned to them. + + *fields* + + **manager_id** + - uint32 + - unique for this manager. + + **managers** + - list + - a list of b64 encoded serialized flis for the main channels of all managers. + + *see also* + DDRegisterManager + + Refer to the :ref:`cfs` section for additional response message fields. + + *implementation(s):* :func:`Python` + +#. **DDConnectToManager** + + *type enum* + DD_CONNECT_TO_MANAGER + + *purpose* + Obtain the manager mainFLI from the main manager so a client can attach to the manager. + + *fields* + + **client_id** + - uint32 + - unique client id assigned by main manager. + + **manager_id** + - uint32 + - the ID of the manager that client requests to connect to. + + *see also* + DDConnectToManagerResponse + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDConnectToManagerResponse** + + *type enum* + DD_CONNECT_TO_MANAGER_RESPONSE + + *purpose* + return the mainFLI of the manager whose ID was provided on the request. + + *fields* + + **mainFLI** + - string + - b64 encoded serialized fli for the main channel for the manager. + + *see also* + DDConnectToManager + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + + +#. **DDRegisterClientID** + + *type enum* + DD_REGISTER_CLIENT_ID + + *purpose* + Register the client ID and associated client response fli with a manager so the + response fli does not need to be included in future messages and client ID can be + used instead. + + *fields* + + **client_id** + - uint32 + - unique client id assigned by main manager. + + **response_fli** + - string + - b64 encoded serialized response fli for client requests. + + **buffered_response_fli** + - string + - b64 encoded serialized response fli for client requests. + + *see also* + DDRegisterClientIDResponse, DDRegisterClient + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDRegisterClientIDResponse** + + *type enum* + DD_REGISTER_CLIENT_ID_RESPONSE + + *purpose* + Provide the acknowledgement that the client is registered with the manager. + This serves as a synchronization point for client/manager interaction. + + *fields* + + **None other than the err field which will hold a dragon return code.** + + *see also* + DDRegisterClientID, DDRegisterClient + + Refer to the :ref:`cfs` section for additional response message fields. + + *implementation(s):* :func:`Python` + +#. **DDDestroy** + + *type enum* + DD_DESTROY + + *purpose* + Sent by a client to the orchestrator to destroy the distributed dictionary. + + *fields* + + **client_id** + - uint32 + - The client id of the requesting client. + + **response_fli** + - string + - b64 encoded serialized response fli. + + *see also* + DDDestroyResponse, DDDestroyManager + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDDestroyResponse** + + *type enum* + DD_DESTROY_RESPONSE + + *purpose* + Provide the acknowledgement that the distributed dictionary destruction has + completed. 
+ + *fields* + + **None other than the err field which will hold a dragon return code.** + + *see also* + DDDestroy + + Refer to the :ref:`cfs` section for additional response message fields. + + *implementation(s):* :func:`Python` + +#. **DDDestroyManager** + + *type enum* + DD_DESTROY_MANAGER + + *purpose* + Sent by the orchestrator to destroy a distributed manager. + + *fields* + + **response_fli** + - string + - b64 encoded serialized response fli. + + *see also* + DDDestroyManagerResponse, DDDestroy + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDDestroyManagerResponse** + + *type enum* + DD_DESTROY_MANAGER_RESPONSE + + *purpose* + Provide the acknowledgement that the distributed dictionary manager destruction has + completed. + + *fields* + + **None other than the err field which will hold a dragon return code.** + + *see also* + DDDestroyManager + + Refer to the :ref:`cfs` section for additional response message fields. + + *implementation(s):* :func:`Python` + +#. **DDPut** + + *type enum* + DD_PUT + + *purpose* + Sent by a client to put a key/value pair into the distributed dictionary. It is sent + to a particular manager which is chosen by pre-hashing the key and dividing modulo the + number of managers. + + *fields* + + **client_id** + - uint32 + - The client id of the requesting client. + + **chkpt_id** + - uint64 + - The checkpoint identifier for this operation. + + *NOTE* The key and value are written separately from the message using the fli api. + + *see also* + DDPutResponse + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDPutResponse** + + *type enum* + DD_PUT_RESPONSE + + *purpose* + Provide the acknowledgement that the distributed dictionary manager that the + put has completed. + + *fields* + + **None other than the err field which will hold a dragon return code.** + + *see also* + DDPut + + Refer to the :ref:`cfs` section for additional response message fields. + + *implementation(s):* :func:`Python` + +#. **DDGet** + + *type enum* + DD_GET + + *purpose* + Sent by a client to a manager to get a value for a key. + + *fields* + + **client_id** + - uint32 + - The client id of the requesting client. + + **chkpt_id** + - uint64 + - The checkpoint identifier for this operation. + + *NOTE* The key is written separately from the message using the fli api. + + *see also* + DDGetResponse + + Refer to the :ref:`cfs` section for additional request message fields. + + *implementation(s):* :func:`Python` + +#. **DDGetResponse** + + *type enum* + DD_GET_RESPONSE + + *purpose* + Provide the value for the associated key or an error code indicating what happened. + + *fields* + + **None other than the err field which will hold a dragon return code.** + + *NOTE* The value is written separately from the message using the fli api. + + *see also* + DDGet + + Refer to the :ref:`cfs` section for additional response message fields. 
+ + *implementation(s):* :func:`Python` + diff --git a/doc/ref/data/images/ddict_bringup.srms b/doc/ref/data/images/ddict_bringup.srms new file mode 100644 index 0000000..fa760cd --- /dev/null +++ b/doc/ref/data/images/ddict_bringup.srms @@ -0,0 +1,106 @@ +8 18 +begin components +py "Python Client Program" +c1 "Client Prog 2" +orc "Orchestrator" +m1 "Manager 1" +m2 "Manager 2" +ls "Local Services (per node)" +end; +ltext@ py "Python Program creates a Distributed Dictionary with ddict(...)"; +ltext@ py "Python Program creates the Orchestrator via native Popen with stdout redirected."; +ltext@ py "Orchestrator attaches to default pool, creates its main fli channel."; +; +orc py "Orchestrator writes serialized descriptor to FLI to stdout."; +; +ltext@ py "Python Program reads serialized descriptor from stdout of orc."; +; +py orc "DDCreate(respFLI, args of create)"; +; +orc m1 "Managers are created via ProcessGroup with serialized (RespFLI, OrcMainFLI) of orchestrator."; +; +ltext@ m1 "Each manager creates pool, main channel (fli)"; +; +ltext@ m1 "Each manager registers the Orchestrator's mainFLI as a key."; +ltext@ m1 "This occurs to distribute client connections to nodes."; +ltext@ m1 "Multiple managers on a node will result in only last in LS KV"; +ltext@ m1 "Store which is OK."; +; +m1 ls "SHSetKV(OrcMainFLI, MainManagerFLI)"; +m2 ls "SHSetKV(OrcMainFLI, MainManagerFLI)"; +ls m1 "SHSetKVResponse"; +ls m2 "SHSetKVResponse"; +ltext@ orc "Registration to Orc occurs so all managers discover all other manager FLIs"; +ltext@ orc "This allows clients to attach to dictionary by interacting with the manager"; +ltext@ orc "on its own node. This allows attaching to be distributed."; +; +m1 orc "DDRegisterManager(RespFLI, m1ManagerMainFLI)"; +m2 orc "DDRegisterManager(RespFLI, m2ManagerMainFLI)"; +orc m1 "DDRegisterManagerResponse(list(ManagerMainFLIs)) to RespFLI"; +orc m2 "DDRegisterManagerResponse(list(ManagerMainFLIs)) to RespFLI"; +; +ltext@ py "All managers started and registered. 
Creation complete."; +; +orc py "DDCreateResponse"; +; +ltext@ py "Orchestrator switches to processing main fli"; +; +py ls "SHGetKV(OrcMainFLI)"; +ls py "SHGetKVResponse(ManagerMainFLI)"; +py m2 "DDRegisterClient(RespFLI, BufferedRespFLI) sent to manager"; +m2 py "DDRegisterClientResponse(numManagers, clientID)"; +; +ltext@ py "Client is now ready for Distributed Dictionary interaction"; +ltext@ py "The ddict object can be shared with other multiprocessing processes."; +ltext@ py "It can also be serialized and the serialized descriptor passed to"; +ltext@ py "lower level C or C++ code (and Fortran indirectly via C)."; +ltext@ py "By some other means, the serialized fli for the Distributed Dictionary is shared."; +; +c1 ls "SHGetKV(OrcMainFLI)"; +ls c1 "SHGetKVResponse(ManagerMainFLI)"; +c1 m2 "DDRegisterClient(RespFLI, BufferedRespFLI) sent to mainFLI of chosen manager"; +ltext@ m1 "Manager picks clientID in its allowable range"; +m2 c1 "DDRegisterClientResponse(numManagers, clientID)"; +; +ltext@ py "Client code called setitem and Manager 1 is chosen given key"; +ltext@ py "First connection to Manager 1"; +; +ltext@ py "Client/Manager ATTACH Protocol is only done on first interaction with a manager."; +ltext@ py "Start of Client/Manager ATTACH Protocol for a chosen manager."; +; +py m2 "DDConnectToManager(manager index 1) to main of manager on node"; +; +m2 py "DDConnectToManagerResponse(managerMainFLI of m1)"; +; +py m1 "DDRegisterClientID(RespFLI, BufferedRespFLI, clientID) to main of Manager"; +; +m1 py "DDRegisterClientIDResponse(client registered to m1)"; +; +ltext@ py "End of the Client/Manager ATTACH Protocol"; +; +py m1 "DDPut operation (see detailed diagram on putting values)"; +; +m1 py "DDPut response"; +; +c1 m2 "DDConnectToManager(manager index 1) to main of manager on node"; +; +m2 c1 "DDConnectToManagerResponse(managerMainFLI of m1)"; +; +c1 m1 "DDRegisterClientID(BufferedRespFLI, clientID) to main of Manager"; +; +m1 c1 "DDRegisterClientIDResponse"; +; +c1 m1 "DDPut operation (see detailed diagram on putting values)"; +; +m1 c1 "DDPut response"; +; +ltext@ py "Client decides to detach from the distributed dictionary"; +; +py m1 "DDDeregisterClient (BufferedRespFLI, clientID)"; +py m2 "DDDeregisterClient (BufferedRespFLI, clientID)"; +m1 py "DDDeregisterClientResponse"; +m2 py "DDDeregisterClientResponse"; +; +ltext@ py "Deregistering only occurs with managers to which a client was attached."; +; +ltext@ py "All defunct FLIs are now detached from managers and client"; diff --git a/doc/ref/data/images/ddict_clear.srms b/doc/ref/data/images/ddict_clear.srms new file mode 100644 index 0000000..88e30e7 --- /dev/null +++ b/doc/ref/data/images/ddict_clear.srms @@ -0,0 +1,15 @@ +8 4 +begin components +c "User Program" +m1 "Manager 1" +m2 "Manager 2" +end; +ltext@ c "Client program invokes clear to all managers"; +ltext@ c "Then all managers remove all kay-value pairs"; +ltext@ c "Client receive response from managers"; +; +c m1 "DDClear(clientID, chkptID) sent to manager1 main fli."; +c m2 "DDClear(clientID, chkptID) sent to manager2 main fli."; +; +m1 c "DDClearResponse(status=DRAGON_SUCCESS), sent to client's buffered fli"; +m2 c "DDClearResponse(status=DRAGON_SUCCESS), sent to client's buffered fli"; \ No newline at end of file diff --git a/doc/ref/data/images/ddict_contains.srms b/doc/ref/data/images/ddict_contains.srms new file mode 100644 index 0000000..0b1a503 --- /dev/null +++ b/doc/ref/data/images/ddict_contains.srms @@ -0,0 +1,15 @@ +8 4 +begin components +c "User 
Program" +m1 "Manager 1" +m2 "Manager 2" +end; +ltext@ c "Client program invokes to check key existence"; +ltext@ c "Then it hashes the key once key"; +ltext@ c "It chooses the manager by computing the remainder of"; +ltext@ c "dividing by the number of managers."; +; +c m2 "DDContains(clientID, chkptID) sent to chosen manager main fli."; +c m2 "Key - not a message"; +c m2 "EOT (low-level fli protocol)"; +m2 c "DDContainsResponse(status=DRAGON_SUCCESS) sent to client's buffered fli"; \ No newline at end of file diff --git a/doc/ref/data/images/ddict_get.srms b/doc/ref/data/images/ddict_get.srms new file mode 100644 index 0000000..03b370e --- /dev/null +++ b/doc/ref/data/images/ddict_get.srms @@ -0,0 +1,20 @@ +8 5 +begin components +c "User Program" +m1 "Manager 1" +m2 "Manager 2" +end; +ltext@ c "Client program invokes get on a key"; +ltext@ c "The Client buffers all writes for the key value."; +ltext@ c "Then it hashes the key once key writing is complete"; +ltext@ c "It chooses the manager by computing the remainder of"; +ltext@ c "dividing by the number of managers."; +; +c m2 "DDGet(clientID, chkptID) sent to chosen manager main fli."; +c m2 "Key value is written on one write - not a message"; +c m2 "Send handle closed resulting in EOT (low-level fli protocol)"; +m2 c "DDGetResponse(err=DRAGON_SUCCESS) sent to client RespFLI"; +m2 c "Value part 1"; +m2 c "Value part ..."; +m2 c "Value part n"; +m2 c "EOT (low-level fli protocol)"; \ No newline at end of file diff --git a/doc/ref/data/images/ddict_getLength.srms b/doc/ref/data/images/ddict_getLength.srms new file mode 100644 index 0000000..e1d6b02 --- /dev/null +++ b/doc/ref/data/images/ddict_getLength.srms @@ -0,0 +1,15 @@ +8 4 +begin components +c "User Program" +m1 "Manager 1" +m2 "Manager 2" +end; +ltext@ c "Client program invokes get length from all managers"; +ltext@ c "Then all managers send their length of kvs to client"; +ltext@ c "Client sum up the length"; +; +c m1 "DDGetLength(clientID, chkptID) sent to all managers' main fli."; +c m2 "DDGetLength(clientID, chkptID) sent to all managers' main fli."; +; +m1 c "DDGetLengthResponse(status=DRAGON_SUCCESS) sent to client's buffered fli"; +m2 c "DDGetLengthResponse(status=DRAGON_SUCCESS) sent to client's buffered fli"; diff --git a/doc/ref/data/images/ddict_keys.srms b/doc/ref/data/images/ddict_keys.srms new file mode 100644 index 0000000..ec7423d --- /dev/null +++ b/doc/ref/data/images/ddict_keys.srms @@ -0,0 +1,25 @@ +8 6 +begin components +c "User Program" +m1 "Manager 1" +m2 "Manager 2" +end; +ltext@ c "Client program invokes to get all keys from all managers"; +ltext@ c "Then all managers send their keys to client"; +ltext@ c "Client gathers keys and return it"; +; +c m1 "DDKeys(clientID, chkptID) sent to manager1 main fli."; +c m1 "EOT (low-level fli protocol)"; +m1 c "DDKeysResponse(status=DRAGON_SUCCESS, keys_length=n)"; +m1 c "Key 1"; +m1 c "Key ..."; +m1 c "Key n"; +m1 c "EOT (low-level fli protocol)"; +; +c m2 "DDKeys(clientID) sent to manager2 main fli."; +c m2 "EOT (low-level fli protocol)"; +m2 c "DDKeysResponse(status=DRAGON_SUCCESS, keys_length=n)"; +m2 c "Key 1"; +m2 c "Key ..."; +m2 c "Key n"; +m2 c "EOT (low-level fli protocol)"; \ No newline at end of file diff --git a/doc/ref/data/images/ddict_overview.png b/doc/ref/data/images/ddict_overview.png new file mode 100644 index 0000000..56c3b83 Binary files /dev/null and b/doc/ref/data/images/ddict_overview.png differ diff --git a/doc/ref/data/images/ddict_pop.srms b/doc/ref/data/images/ddict_pop.srms new file mode 
100644 index 0000000..e4a986a --- /dev/null +++ b/doc/ref/data/images/ddict_pop.srms @@ -0,0 +1,20 @@ +8 5 +begin components +c "User Program" +m1 "Manager 1" +m2 "Manager 2" +end; +ltext@ c "Client program invokes get on a key"; +ltext@ c "The Client buffers all writes for the key value."; +ltext@ c "Then it hashes the key once key writing is complete"; +ltext@ c "It chooses the manager by computing the remainder of"; +ltext@ c "dividing by the number of managers."; +; +c m2 "DDPop(clientID, chkptID) sent to chosen manager main fli."; +c m2 "Key - not a message"; +c m2 "EOT (low-level fli protocol)"; +m2 c "DDPopResponse(err=DRAGON_SUCCESS)"; +m2 c "Value part 1"; +m2 c "Value part ..."; +m2 c "Value part n"; +m2 c "EOT (low-level fli protocol)"; \ No newline at end of file diff --git a/doc/ref/data/images/ddict_put.srms b/doc/ref/data/images/ddict_put.srms new file mode 100644 index 0000000..9c88dc9 --- /dev/null +++ b/doc/ref/data/images/ddict_put.srms @@ -0,0 +1,19 @@ +8 5 +begin components +c "User Program" +m1 "Manager 1" +m2 "Manager 2" +end; +ltext@ c "Client program invokes put on a key/value pair"; +ltext@ c "The Client buffers all writes for the key value."; +ltext@ c "Then it hashes the key once key writing is complete"; +ltext@ c "It chooses the manager by computing the remainder of"; +ltext@ c "dividing by the number of managers."; +; +c m2 "DDPut(clientID, chkptID) sent to manager main fli."; +c m2 "Key value is written on one write - not a message"; +c m2 "Value part 1"; +c m2 "Value part ..."; +c m2 "Value part n"; +c m2 "EOT (low-level fli protocol)"; +m2 c "DDPutResponse(status=DRAGON_SUCCESS) sent to client's buffered fli"; diff --git a/doc/ref/data/images/ddict_teardown.srms b/doc/ref/data/images/ddict_teardown.srms new file mode 100644 index 0000000..362c555 --- /dev/null +++ b/doc/ref/data/images/ddict_teardown.srms @@ -0,0 +1,34 @@ +8 7 +begin components +c "User Program" +orc "Orchestrator" +m1 "Manager 1" +m2 "Manager 2" +ls "Local Services (one per node)" +end; +ltext@ c "Program is reponsible for destroying distributed dictionary and initiates it."; +; +c orc "DDDestroy(clientID, BufferedRespFLI)"; +orc m1 "DDDestroyManager"; +orc m2 "DDDestroyManager"; +m1 ls "SHSetKV(orcMainFLI, '') results in deleting key"; +ls m1 "SHSetKVResponse"; +m1 orc "DDDestroyManagerResponse"; +; +ctext@ m1 "Manager 1 frees channels and pools"; +ctext@ m1 "Manager 1 exits"; +; +m2 ls "SHSetKV(orcMainFLI, '')"; +ls m2 "SHSetKVResponse"; +m2 orc "DDDestroyManagerResponse"; +; +ctext@ m2 "Manager 2 frees channels and pools"; +ctext@ m2 "Manager 2 exits"; +; +orc c "DDDestroyResponse"; +; +ctext@ orc "Orchestrator frees channels"; +ctext@ orc "Orchestrator exits"; +; + +ltext@ c "Disributed Dictionary is now destroyed"; \ No newline at end of file diff --git a/doc/ref/data/images/manager.png b/doc/ref/data/images/manager.png new file mode 100644 index 0000000..f26796c Binary files /dev/null and b/doc/ref/data/images/manager.png differ diff --git a/doc/ref/data/images/working_set.png b/doc/ref/data/images/working_set.png new file mode 100644 index 0000000..38fd78a Binary files /dev/null and b/doc/ref/data/images/working_set.png differ diff --git a/doc/ref/data/index.rst b/doc/ref/data/index.rst index 51d9196..3dd6f22 100644 --- a/doc/ref/data/index.rst +++ b/doc/ref/data/index.rst @@ -1,15 +1,11 @@ -Data +Data ++++++++ -Python Reference -================ -.. currentmodule:: dragon.data.distdictionary +Dragon has APIs for managing data in a scalable fashion. 
A prominent member of +these APIs is the distributed dictionary. Descriptions and APIs are grouped +together for each of the supported Dragon Data Types. -.. autosummary:: - :toctree: - :recursive: - - dict_managers - distributed_dict - dragon_dict +.. toctree:: + :maxdepth: 5 + ddict.rst diff --git a/doc/ref/inf/index.rst b/doc/ref/inf/index.rst index 4d6c5f7..6d7c72a 100644 --- a/doc/ref/inf/index.rst +++ b/doc/ref/inf/index.rst @@ -49,7 +49,7 @@ Architecture .. figure:: images/infrastructure_architecture.svg :scale: 75% - :name: dragon-inf-api-architecture + :name: dragon-inf-api-architecture **Architecture of the Dragon Infrastructure API** diff --git a/doc/ref/mpbridge/index.rst b/doc/ref/mpbridge/index.rst index 36269fb..f07ac1f 100644 --- a/doc/ref/mpbridge/index.rst +++ b/doc/ref/mpbridge/index.rst @@ -1,4 +1,4 @@ -MPBridge +MPBridge ++++++++ The Dragon MPbridge component maps the Python Multiprocessing API onto :ref:`ref/native/index:Dragon Native` @@ -24,7 +24,7 @@ Components .. figure:: images/mpbridge_architecture.svg :scale: 75% - :name: mpbridge-architecture + :name: mpbridge-architecture **MPBridge architecture** @@ -54,12 +54,12 @@ Example: The Dragon Queue For example, the size of a queue in Dragon native is `q.size()`, while in Multiprocessing it is `q.qsize()`. We created a private method `q._size()` and have `q.size()` wrap it in Dragon Native. In MPBridge, we then remove the `q.size()` that DragonQueue has inherited from Dragon Native's queue and add `q.qsize()` in -DragonQueue that wraps the same private method. +DragonQueue that wraps the same private method. Next we show a class diagram of Dragons queue implementation and how it is inserted into the Multiprocessing package. .. figure:: images/mpbridge_class_diagram.svg - :name: dragon-mpbridge-queue-impl + :name: dragon-mpbridge-queue-impl **Class diagram of the mpbridge.queue implementation.** @@ -68,11 +68,11 @@ of the public interface of the three Python Multiprocessing Queues: ``Queue``, ` ``SimpleQueue``. The MPBridge component inherits from ``dragon.native.queue.Queue`` into ``dragon.mpbridge.queues.DragonQueu``, ``dragon.mpbridge.queue.DragonSimpleQueue`` and ``dragon.mpbridge.queue.DragonJoinableQueue``. The public API is modified accordingly, so that it conforms with the -Multiprocessing API. +Multiprocessing API. The MPBridge component also contains 3 functions (``Queue``, ``SimpleQueue`` and ``JoinableQueue``) that return the corresponding -classes. The are called from the ``DragonContext``. +classes. The are called from the ``DragonContext``. Just as in Multiprocessing, the methods below the context are exported during startup into the module API. The context itself is part of a list of contexts held at the top level, containing a context per start method. Setting the start method then means setting -the ``DefaultContext`` equal to one of these contexts. To add our start method to this mechanism, we add an ``AugmentedDefaultContext`` +the ``DefaultContext`` equal to one of these contexts. To add our start method to this mechanism, we add an ``AugmentedDefaultContext`` that adds our start method to the list of possible start methods and overloads the ``set_start_method`` method. diff --git a/doc/ref/native/index.rst b/doc/ref/native/index.rst index 0657c83..6acea3d 100644 --- a/doc/ref/native/index.rst +++ b/doc/ref/native/index.rst @@ -37,7 +37,7 @@ Architecture .. 
figure:: images/architecture.svg :scale: 75% - :name: dragon-native-architecture + :name: dragon-native-architecture **The Dragon native architecture** diff --git a/doc/services/launcher.rst b/doc/services/launcher.rst index 38451f9..fa5d0ba 100644 --- a/doc/services/launcher.rst +++ b/doc/services/launcher.rst @@ -28,7 +28,7 @@ Launcher Single Node Architecture **Single node architecture of the Launcher component** .. figure:: images/singlenodelauncher.png - :name: singlenode-launcher + :name: singlenode-launcher **Single-node Launcher/Backend Components** @@ -53,12 +53,12 @@ Launcher Multi Node Architecture ================================ .. figure:: images/launcher_multi_node.svg - :name: launcher-multi-node + :name: launcher-multi-node **Multi node architecture of the Launcher component** .. figure:: images/launchercomponents.png - :name: launcher-comps + :name: launcher-comps **Multi-node Launcher/Backend Components** @@ -337,7 +337,7 @@ may be necessary for some multi-node applications but can be used in single-node as well allowing a server application to run in either environment. .. figure:: images/servermode.png - :name: servermode + :name: servermode **Dragon Server Mode** @@ -350,7 +350,7 @@ Dragon run-time services. .. figure:: images/server.srms1.png :scale: 75% - :name: passthru-message-ex + :name: passthru-message-ex **PassThru Message Exchange** @@ -431,7 +431,7 @@ State Transitions ------------------- .. figure:: images/launcherstates.png - :name: launcherstates + :name: launcherstates **State Diagram** @@ -496,7 +496,7 @@ functionality is in the implementation of a Jupyter Notebook kernel that runs within the Dragon run-time services. .. figure:: images/jupytermode.png - :name: jupytermode + :name: jupytermode **Dragon Server Mode for Jupyter Notebooks** diff --git a/doc/services/local_services.rst b/doc/services/local_services.rst index 9654ca9..a5ff1cf 100644 --- a/doc/services/local_services.rst +++ b/doc/services/local_services.rst @@ -45,12 +45,12 @@ Architecture ============ .. figure:: images/shepherd.svg - :name: shepherd + :name: shepherd **Internal Shepherd Structure** .. figure:: images/shepherdstructure.png - :name: shepherdstructure + :name: shepherdstructure **Internal Shepherd Structure** @@ -121,7 +121,7 @@ Process Management ================== .. figure:: images/processstates.png - :name: processstates + :name: processstates **Process State Transition Diagram** @@ -154,7 +154,7 @@ all notifications about output on standard output and error, while the Global Se notification of the termination of the process. .. figure:: images/managedservices.png - :name: managedservices + :name: managedservices **Managed Process services provided by Local Services** @@ -190,7 +190,7 @@ The Local Services/Global Services Integration ======================================== .. 
figure:: images/gsmonitor.png - :name: gsmonitor + :name: gsmonitor **The Global Services Monitor** diff --git a/doc/start/start.rst b/doc/start/start.rst index 1be9820..9c47952 100644 --- a/doc/start/start.rst +++ b/doc/start/start.rst @@ -17,7 +17,9 @@ You need to have the following software packages installed on your system: - Python 3.9, 3.10, or 3.11 corresponding to your whl file (e.g., module load cray-python) - GCC 9 or later -- Slurm or PBS+PALS (for multi-node Dragon) +- Slurm or PBS+PALS (for multi-node Dragon on a super computer) OR +- A cluster with configured passwordless ssh keys and an MPI-like hostfile (to run multi-node + on a cluster) Download Dragon =================== @@ -28,50 +30,65 @@ Install Dragon =================== Before you can run programs using Dragon, you must set up the run-time for your -environment. You must have Python 3.9 installed and it must be in your path -somewhere. A common choice is to use a Python virtual environment, which can be initialized -for example from a base Python 3.9+ with: +environment. The untarred distribution file contains several subdirectories. All +provided commands are relative to the directory that contains the README.md. The +`dragon-*.whl` file must be pip3 installed once for your environment. The +`capnp-*.whl` file is also required. Some setup may be required to use module +support to load module files which set a few environment variables. The steps +are outlined for you in the rest of this section. + +You must have Python 3.9 installed and it must be in your path somewhere. + +A common choice for running Python programs is to use a Python virtual +environment. An install script is supplied in the distribution that performs the +install step(s) for you and creates and activates a virtual environment. You will +find this install script in the untarred distribution file at the root level. .. code-block:: console - python3 -m venv --clear _env - . _env/bin/activate + ./dragon-install -The untarred distribution file contains several subdirectories. All provided commands -are relative to the directory that contains the README.md. +You have completed the prerequisites for running Dragon with multiprocessing programs. -* The `dragon-*.whl` file must be pip3 installed once for your environment. +If there was an error about loading modules, then you need to enable module loading. In +that case, see the subsection below on *Enabling Module Support*. + +If you have already installed and want to come back and use your install at a later +time you may have to reactivate your environment. Execute this from the same directory. .. code-block:: console - pip3 install --force-reinstall dragon-0.8-*.whl + . _env/bin/activate -* Check and possibly update that `$PATH` is has the location of pip installed - console scripts, such as ~/.local/bin if you're not using a virtual environment. +Along with reactivating your environment you will also need to load the dragon +module. .. code-block:: console - export PATH=~/.local/bin:${PATH} + module use $PWD/modulefiles + module load dragon -* You must set up the environment by loading the dragon module as follows. +If you are NOT using a virtual environment then check and possibly update the +`$PATH` so it has the location of pip installed console scripts, such as +~/.local/bin. If using a virtual environment, this step is not necessary. .. 
code-block:: console - module use [/path to dragon-0.8]/modulefiles - module load dragon + export PATH=~/.local/bin:${PATH} -If you intend to use Dragon on your own Linux VM or an image that you -personally installed, you may need to enable module commands by adding the -following command to your ~/.bashrc or other login script. +Enabling Module Support +-------------------------- + +If you intend to use Dragon on your own Linux VM or an image that you personally +installed, you may need to enable module commands first by adding the following +command to your ~/.bashrc or other login script. .. code-block:: console source /usr/share/modules/init/bash -If you use a different shell, look in the `init` directory for a script for -your shell. - -You have completed the prerequisites for running Dragon with multiprocessing programs. +If you use a different shell, look in the `init` directory for a script for your +shell. Running Dragon ============== diff --git a/doc/uguide/glossary.rst b/doc/uguide/glossary.rst index 610eb69..94ee326 100644 --- a/doc/uguide/glossary.rst +++ b/doc/uguide/glossary.rst @@ -3,7 +3,7 @@ Glossary .. figure:: images/dragon_domain_model.svg :scale: 75% - :name: dragon-domain-model + :name: dragon-domain-model **UML diagram of the most important Dragon concepts and their relation. Open arrows are read as "is a", diamond edges as "contains", normal arrows are annotated** diff --git a/doc/uguide/intro.rst b/doc/uguide/intro.rst index ca9b71b..87cfc96 100644 --- a/doc/uguide/intro.rst +++ b/doc/uguide/intro.rst @@ -16,7 +16,7 @@ independently of where processes and Dragon resources are placed. .. figure:: images/overview_queue_doc.jpg :align: center :scale: 25% - :name: overview-queue-doc + :name: overview-queue-doc **Dragon Object Location Transparency** diff --git a/doc/uguide/resource_model.rst b/doc/uguide/resource_model.rst index 7fef41c..9641cf5 100644 --- a/doc/uguide/resource_model.rst +++ b/doc/uguide/resource_model.rst @@ -50,7 +50,7 @@ Object Hierarchy .. figure:: images/dragon_object_hierarchy.png :scale: 15% - :name: dragon-obj-hierarchy + :name: dragon-obj-hierarchy **A representation of the Dragon object hierarchy across Dragon Native API and Client API. Not all derived objects are shown.** diff --git a/examples/dragon_core/performance/ch_p2p_common.c b/examples/dragon_core/performance/ch_p2p_common.c index da75134..669a960 100644 --- a/examples/dragon_core/performance/ch_p2p_common.c +++ b/examples/dragon_core/performance/ch_p2p_common.c @@ -30,7 +30,6 @@ attach_to_memory_pool(char *b64_mpool_data, dragonMemoryPoolDescr_t *pdragon_mpo DEBUG_PRINT(("Decoding memory pool's serialized descriptor\n")); dragon_mpool_serial.data = dragon_base64_decode( b64_mpool_data, - strlen(b64_mpool_data), &dragon_mpool_serial.len); DEBUG_PRINT(("Attaching to memory pool\n")); diff --git a/examples/dragon_core/ringproc.c b/examples/dragon_core/ringproc.c index 2c8f6ed..8581370 100644 --- a/examples/dragon_core/ringproc.c +++ b/examples/dragon_core/ringproc.c @@ -34,7 +34,6 @@ int main(int argc, char* argv[]) { dragonMessage_t msg; char* send_ser_encoded; char* final_ser_encoded; - size_t send_ser_len; /* This function is necessary for off-node communication and relies on the * Dragon run-time services to supply gateway channels in the @@ -66,7 +65,7 @@ int main(int argc, char* argv[]) { * Dragon provides both base64 encoding and decoding for * interoperability between languages. 
*/ - recv_chser.data = dragon_base64_decode(argv[3], strlen(argv[3]), &recv_chser.len); + recv_chser.data = dragon_base64_decode(argv[3], &recv_chser.len); /* With a valid serialized descriptor you can attach to a channel. This * attach here occurs on an off-node channel (except in the one node @@ -151,7 +150,7 @@ int main(int argc, char* argv[]) { return -1; } - send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len, &send_ser_len); + send_ser_encoded = dragon_base64_encode(send_chser.data, send_chser.len); err = dragon_memory_pool_detach(&pool_descr); if (err != DRAGON_SUCCESS) { @@ -174,7 +173,7 @@ int main(int argc, char* argv[]) { send_ser_encoded = argv[4]; final_ser_encoded = argv[5]; - send_chser.data = dragon_base64_decode(send_ser_encoded, strlen(send_ser_encoded), &send_chser.len); + send_chser.data = dragon_base64_decode(send_ser_encoded, &send_chser.len); err = dragon_channel_attach(&send_chser, &send_ch); if (err != DRAGON_SUCCESS) { @@ -189,7 +188,7 @@ int main(int argc, char* argv[]) { return -1; } - final_chser.data = dragon_base64_decode(final_ser_encoded, strlen(final_ser_encoded), &final_chser.len); + final_chser.data = dragon_base64_decode(final_ser_encoded, &final_chser.len); err = dragon_channel_attach(&final_chser, &final_ch); if (err != DRAGON_SUCCESS) { diff --git a/examples/dragon_data/ddict/ddict_bench.py b/examples/dragon_data/ddict/ddict_bench.py new file mode 100644 index 0000000..3040fde --- /dev/null +++ b/examples/dragon_data/ddict/ddict_bench.py @@ -0,0 +1,233 @@ +import enum +import time +import dragon +import multiprocessing as mp +import argparse +import string +import random +import traceback +import sys + +from dragon.infrastructure import parameters as dparm +from dragon.data.ddict.ddict import DDict + +@enum.unique +class DictOp(enum.Enum): + """Action to be performed on the dictionary item""" + SET_ITEM = enum.auto() + GET_ITEM = enum.auto() + DEL_ITEM = enum.auto() + +def do_dict_ops(keys, ddict, client_id, iterations, msg_size, result_link, dict_op): + """Function used to execute operations on the shared Dragon dictionary + :param keys: List of all the keys of the dictionary + :type keys: list + :param ddict: A dragon dictionary object + :type ddict: dragon dictionary + :param client_id: Unique ID of the client + :type client_id: int + :param iterations: Number of iterations to perform a dictionary operation + :type iterations: int + :param msg_size: Number of characters used for the length of the value + :type msg_size: int + :param result_link: A pipe used for the communication of the results + :type result_link: connection object + :param dict_op: Enum that controls the operations of the dictionary + :type dict_op: enum + """ + try: + if dict_op == DictOp.SET_ITEM or dict_op == DictOp.DEL_ITEM: + letters = string.ascii_letters + value = ''.join(random.choice(letters) for i in range(msg_size)) + + if dict_op == DictOp.SET_ITEM: + print(f'CLIENT: Started Set Item Operations {dparm.this_process.my_puid=}', flush=True) + start = time.monotonic() + for i in range(iterations): + key = random.choice(keys) + ddict[key] = value + end = time.monotonic() + elif dict_op == DictOp.GET_ITEM: + start = time.monotonic() + for i in range(iterations): + key = random.choice(keys) + val = ddict[key] + end = time.monotonic() + elif dict_op == DictOp.DEL_ITEM: + start = time.monotonic() + for i in range(iterations): + # key = random.choice(keys) + key = keys[i] + del ddict[key] + ddict[key] = value + end = time.monotonic() + + 
result_link.send((start, end)) + if client_id == 0: + print(f"DictOp {dict_op.value}: I am client {client_id}. (start): {start} -- (end): {end}. (end - start): {end - start}. Elapsed time: {(end - start) / iterations} sec", flush=True) + + ddict.detach() + except Exception as e: + tb = traceback.format_exc() + print(f'There was an exception in do_dict_ops: {e} \n Traceback: \n {tb}', flush=True) + +def generate_keys(dict_size=100): + """Generate a list including the keys that will be used for the dictionary. + :param dict_size: Total number of keys to be populated in the dicitonary + :type dict_size: int + :return: List of keys to be stored along with values in the dictionary + :rtype: list + """ + my_keys = list() + letters = string.ascii_letters + + for _ in range(dict_size): + # each key is 30 characters long + # key = ''.join(random.choice(letters) for i in range(30)) # characters can be repeated + key = ''.join(random.choice(letters) for i in range(8)) # characters can be repeated + my_keys.append(key) + + assert len(my_keys) == dict_size + return my_keys + + +def assign_keys(ddict, keys, value_size): + """Initiate the dictionary. Assign the values to each key in the provided list. + Each value is a string of msg_size characters long. + :param ddict: A dragon dictionary object + :type ddict: dragon dictionary + :param keys: List of keys to assign values in the dictionary + :type keys: list + :param value_size: Number of characters used for the length of the value + :type value_size: int + """ + for key in keys: + letters = string.ascii_letters + value = ''.join(random.choice(letters) for i in range(value_size)) + ddict[key] = value + + ddict.detach() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Distributed dictionary benchmark') + parser.add_argument('--dragon', action='store_true', help='run using dragon') + parser.add_argument('--dict_size', type=int, default=500, + help='Number of (key,value) pairs inside the dict') + parser.add_argument('--value_size', type=int, default=1024, + help='size of the value (bytes) that are stored in the dict') + parser.add_argument('--num_nodes', type=int, default=1, + help='number of nodes the dictionary distributed across') + parser.add_argument('--clients', type=int, default=10, + help='number of client processes performing operations on the dict') + parser.add_argument('--managers_per_node', type=int, default=1, + help='number of managers per node for the dragon dict') + parser.add_argument('--total_mem_size', type=float, default=1, + help='total managed memory size for dictionary in GB') + parser.add_argument('--batch_size', type=int, default=100, + help='number of kv pairs added by each process before dict operations') + parser.add_argument('--iterations', type=int, default=1000, + help='number of iterations') + parser.add_argument('--dict_ops', type=int, default=0, + help='choose the operations to be performed on the dict -- ' + '0 to set the values, ' + '1 to get the values, ' + '2 for both, ' + '3 for deletes (includes setting the values for now to continue further deletes)') + + my_args = parser.parse_args() + + mp.set_start_method("dragon") + total_mem_size = int(my_args.total_mem_size * (1024*1024*1024)) + print(f'{total_mem_size=}', flush=True) + ddict = DDict(my_args.managers_per_node, my_args.num_nodes, total_mem_size) + + num_clients = my_args.clients + value_size = my_args.value_size + + if my_args.dict_ops == 3: + dict_size = my_args.clients * my_args.iterations + else: + dict_size = my_args.dict_size 
+ all_keys = generate_keys(dict_size=dict_size) + # Parallelize the initialization with batch size of key/value entries + num_keys = len(all_keys) + batch_size = my_args.batch_size + num_batches = num_keys // batch_size + int(num_keys % batch_size != 0) + print(f'{num_batches=}', file=sys.stderr, flush=True) + jobs = [] + for i in range(num_batches): + if (i == num_batches - 1) and (num_keys % batch_size != 0): + batch_keys = all_keys[i * batch_size:] + else: + batch_keys = all_keys[i * batch_size:(i+1) * batch_size] + jobs.append(mp.Process(target=assign_keys, args=(ddict, batch_keys, value_size))) + + # Complete the initialization of the dictionary + _ = [p.start() for p in jobs] + _ = [p.join() for p in jobs] + _ = [p.terminate() for p in jobs] # make sure we clean everything + + length = len(ddict) + print(f'Length of the dictionary is {length}', flush=True) + + dict_ops = [] + if my_args.dict_ops == 0: + dict_ops.append(DictOp.SET_ITEM) + elif my_args.dict_ops == 1: + dict_ops.append(DictOp.GET_ITEM) + elif my_args.dict_ops == 2: + dict_ops.append(DictOp.SET_ITEM) + dict_ops.append(DictOp.GET_ITEM) + elif my_args.dict_ops == 3: + dict_ops.append(DictOp.DEL_ITEM) + + + for ii in range(len(dict_ops)): + result_links = [mp.Pipe(duplex=False) for _ in range(num_clients)] + try: + procs = [] + if dict_ops[ii] == DictOp.DEL_ITEM: + for i in range(num_clients): + print(f'{i*my_args.iterations}:{(i+1)*my_args.iterations}', flush=True) + client_proc = mp.Process(target=do_dict_ops, + args=(all_keys[i*my_args.iterations:(i+1)*my_args.iterations], ddict, i, my_args.iterations, + value_size, result_links[i][1], dict_ops[ii],)) + client_proc.start() + print(f'{client_proc=}', flush=True) + procs.append(client_proc) + else: + for i in range(num_clients): + client_proc = mp.Process(target=do_dict_ops, + args=(all_keys, ddict, i, my_args.iterations, + value_size, result_links[i][1], dict_ops[ii],)) + client_proc.start() + print(f'{client_proc=}', flush=True) + procs.append(client_proc) + + # min_start = 1.0e9 + # since we have joined the processes, we know this value will be greater than the processes' corresponding values + min_start = time.monotonic() + max_end = 0.0 + for i in range(num_clients): + start, end = result_links[i][0].recv() + min_start = min(min_start, start) + max_end = max(max_end, end) + + for i in range(len(procs)): + procs[i].join() + + result = (max_end - min_start) / my_args.iterations + rate = (my_args.iterations * num_clients) / (max_end - min_start) # aggregated rate + + print(f"\n{dict_ops[ii]}:", flush=True) + print(f"Msglen [B] Lat [sec]\n{value_size} {result}", flush=True) + print(f"Msglen [B] Rate\n{value_size} {rate}\n ", flush=True) + + for i in range(len(procs)): + procs[i].kill() + + except Exception as e: + tb = traceback.format_exc() + print(f'There was an exception in ddict_bench: {e} \n Traeback: \n {tb}', flush=True) + + ddict.destroy() \ No newline at end of file diff --git a/examples/dragon_data/ddict/ddict_bench_pg.py b/examples/dragon_data/ddict/ddict_bench_pg.py new file mode 100644 index 0000000..5e32b08 --- /dev/null +++ b/examples/dragon_data/ddict/ddict_bench_pg.py @@ -0,0 +1,237 @@ +import enum +import time +import dragon +import multiprocessing as mp +import argparse +import string +import random +import traceback +import sys +import os + +from dragon.infrastructure.parameters import this_process +from dragon.native.queue import Queue +from dragon.native.process_group import ProcessGroup +from dragon.native.process import ProcessTemplate +from 
dragon.data.ddict.ddict import DDict + +pid = os.getpid() + +@enum.unique +class DictOp(enum.Enum): + """Action to be performed on the dictionary item""" + SET_ITEM = enum.auto() + GET_ITEM = enum.auto() + DEL_ITEM = enum.auto() + +def do_dict_ops(keys, ddict, client_id, iterations, msg_size, result_link, dict_op): + """Function used to execute operations on the shared Dragon dictionary + :param keys: List of all the keys of the dictionary + :type keys: list + :param ddict: A dragon dictionary object + :type ddict: dragon dictionary + :param client_id: Unique ID of the client + :type client_id: int + :param iterations: Number of iterations to perform a dictionary operation + :type iterations: int + :param msg_size: Number of characters used for the length of the value + :type msg_size: int + :param result_link: A pipe used for the communication of the results + :type result_link: connection object + :param dict_op: Enum that controls the operations of the dictionary + :type dict_op: enum + """ + try: + if dict_op == DictOp.SET_ITEM or dict_op == DictOp.DEL_ITEM: + letters = string.ascii_letters + value = ''.join(random.choice(letters) for i in range(msg_size)) + + if dict_op == DictOp.SET_ITEM: + start = time.monotonic() + for i in range(iterations): + key = random.choice(keys) + try: + ddict[key] = value + except Exception as ex: + print(ex, file=sys.stderr, flush=True) + print('This client process is exiting due to the exception.', file=sys.stderr, flush=True) + q.put((i,i)) + ddict.detach() + return + end = time.monotonic() + elif dict_op == DictOp.GET_ITEM: + start = time.monotonic() + for i in range(iterations): + key = random.choice(keys) + val = ddict[key] + end = time.monotonic() + elif dict_op == DictOp.DEL_ITEM: + start = time.monotonic() + for i in range(iterations): + # key = random.choice(keys) + key = keys[i] + del ddict[key] + ddict[key] = value + end = time.monotonic() + + result_link.put((start, end)) + + ddict.detach() + except Exception as e: + tb = traceback.format_exc() + print(f'There was an exception in do_dict_ops: {e} \n Traceback: \n {tb}', flush=True, file=sys.stderr) + +def generate_keys(dict_size=100): + """Generate a list including the keys that will be used for the dictionary. + :param dict_size: Total number of keys to be populated in the dicitonary + :type dict_size: int + :return: List of keys to be stored along with values in the dictionary + :rtype: list + """ + my_keys = list() + letters = string.ascii_letters + + for _ in range(dict_size): + # each key is 30 characters long + # key = ''.join(random.choice(letters) for i in range(30)) # characters can be repeated + key = ''.join(random.choice(letters) for i in range(8)) # characters can be repeated + my_keys.append(key) + + assert len(my_keys) == dict_size + return my_keys + + +def assign_keys(ddict, keys, value_size): + """Initiate the dictionary. Assign the values to each key in the provided list. + Each value is a string of msg_size characters long. 
+ :param ddict: A dragon dictionary object + :type ddict: dragon dictionary + :param keys: List of keys to assign values in the dictionary + :type keys: list + :param value_size: Number of characters used for the length of the value + :type value_size: int + """ + for key in keys: + letters = string.ascii_letters + value = ''.join(random.choice(letters) for i in range(value_size)) + ddict[key] = value + + ddict.detach() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Distributed dictionary benchmark') + parser.add_argument('--dragon', action='store_true', help='run using dragon') + parser.add_argument('--dict_size', type=int, default=500, + help='Number of (key,value) pairs inside the dict') + parser.add_argument('--value_size', type=int, default=1024, + help='size of the value (bytes) that are stored in the dict') + parser.add_argument('--num_nodes', type=int, default=1, + help='number of nodes the dictionary distributed across') + parser.add_argument('--clients', type=int, default=10, + help='number of client processes performing operations on the dict') + parser.add_argument('--managers_per_node', type=int, default=1, + help='number of managers per node for the dragon dict') + parser.add_argument('--total_mem_size', type=float, default=1, + help='total managed memory size for dictionary in GB') + parser.add_argument('--batch_size', type=int, default=100, + help='number of kv pairs added by each process before dict operations') + parser.add_argument('--iterations', type=int, default=1000, + help='number of iterations') + parser.add_argument('--dict_ops', type=int, default=0, + help='choose the operations to be performed on the dict -- ' + '0 to set the values, ' + '1 to get the values, ' + '2 for both, ' + '3 for deletes (includes setting the values for now to continue further deletes)') + + my_args = parser.parse_args() + + mp.set_start_method("dragon") + total_mem_size = int(my_args.total_mem_size * (1024*1024*1024)) + ddict = DDict(my_args.managers_per_node, my_args.num_nodes, total_mem_size, timeout=3000) + + num_clients = my_args.clients + value_size = my_args.value_size + + if my_args.dict_ops == 3: + dict_size = my_args.clients * my_args.iterations + else: + dict_size = my_args.dict_size + all_keys = generate_keys(dict_size=dict_size) + # Parallelize the initialization with batch size of key/value entries + num_keys = len(all_keys) + batch_size = my_args.batch_size + num_batches = num_keys // batch_size + int(num_keys % batch_size != 0) + print(f'{num_batches=}', file=sys.stderr, flush=True) + jobs = [] + + if num_batches > 0: + grp = ProcessGroup(restart=False) + for i in range(num_batches): + if (i == num_batches - 1) and (num_keys % batch_size != 0): + batch_keys = all_keys[i * batch_size:] + else: + batch_keys = all_keys[i * batch_size:(i+1) * batch_size] + + grp.add_process(nproc=1, template=ProcessTemplate(target=assign_keys, args=(ddict, batch_keys, value_size))) + + # initialization of the dictionary + grp.init() + grp.start() + grp.join() + grp.stop() + + length = len(ddict) + print(f'Length of the dictionary is {length}', flush=True) + + dict_ops = [] + if my_args.dict_ops == 0: + dict_ops.append(DictOp.SET_ITEM) + elif my_args.dict_ops == 1: + dict_ops.append(DictOp.GET_ITEM) + elif my_args.dict_ops == 2: + dict_ops.append(DictOp.SET_ITEM) + dict_ops.append(DictOp.GET_ITEM) + elif my_args.dict_ops == 3: + dict_ops.append(DictOp.DEL_ITEM) + + for ii in range(len(dict_ops)): + try: + grp = ProcessGroup(restart=False) + q = Queue() + if 
dict_ops[ii] == DictOp.DEL_ITEM: + for i in range(num_clients): + grp.add_process(nproc=1, template= + ProcessTemplate(target=do_dict_ops, args=(all_keys[i*my_args.iterations:(i+1)*my_args.iterations], ddict, i, my_args.iterations, + value_size, q, dict_ops[ii]))) + else: + grp.add_process(nproc=num_clients, template= + ProcessTemplate(target=do_dict_ops, args=(all_keys, ddict, 100, my_args.iterations, value_size, q, dict_ops[ii]))) + + grp.init() + grp.start() + + min_start = time.monotonic() + max_end = 0.0 + for i in range(num_clients): + start, end = q.get() + print(f"Iteration {i}: Received results from client with value {start=} and {end=}", flush=True) + min_start = min(min_start, start) + max_end = max(max_end, end) + + result = (max_end - min_start) / max(1, my_args.iterations) + rate = (my_args.iterations * num_clients) / max(1, max_end - min_start) # aggregated rate + + print(f"Msglen [B] Lat [sec]\n{value_size} {result}", flush=True) + print(f"Msglen [B] Rate\n{value_size} {rate}\n ", flush=True) + + grp.join() + grp.stop() + + + except Exception as ex: + tb = traceback.format_exc() + print(f'There was an exception in ddict_bench: {ex} \n Traceback: \n {tb}', flush=True) + + + ddict.destroy() diff --git a/examples/dragon_data/ddict/demo_ddict.py b/examples/dragon_data/ddict/demo_ddict.py new file mode 100644 index 0000000..3c51756 --- /dev/null +++ b/examples/dragon_data/ddict/demo_ddict.py @@ -0,0 +1,45 @@ +import dragon +from dragon.data.ddict.ddict import DDict +import multiprocessing as mp + +def client_function(d, client_id): + + key1 = 'hello' + str(client_id) + d[key1] = 'world' + str(client_id) + print(f'added {key1} to dictionary') + +def main(): + """ + Test put and get functions. + """ + # bring up dictionary + # number of manager = 2 + # number of nodes = 1 + # total size of dictionary = 2000000 + # clients and managers will be on different node in round-robin fashion + d = DDict(2,1,2000000) + # create 10 clients, each of them implements client_function + procs = [] + for i in range(10): + proc = mp.Process(target=client_function, args=(d, i)) + procs.append(proc) + proc.start() + + # waiting for all client process to finish + for i in range(10): + procs[i].join() + # lookup all keys and vals + try: + for i in range(10): + key = 'hello' + str(i) + val = d[key] + print(f'd[{repr(key)}] = {repr(val)}') + # print out all key and value given the key + assert val == 'world' + str(i) + except Exception as e: + print(f'Got exception {repr(e)}') + # destroy dictionary + d.destroy() + +if __name__ == "__main__": + main() diff --git a/examples/dragon_data/ddict/demo_ddict_pool.py b/examples/dragon_data/ddict/demo_ddict_pool.py new file mode 100644 index 0000000..9fe12dc --- /dev/null +++ b/examples/dragon_data/ddict/demo_ddict_pool.py @@ -0,0 +1,43 @@ +import dragon +import multiprocessing as mp +from dragon.data.ddict.ddict import DDict +import os + +def client_function(distdict, client_id): + + key1 = str(os.getpid()) + 'hello' + str(client_id) + distdict[key1] = 'world' + str(client_id) + print(f'added {key1} to dictionary') + return key1 + +def main(): + mp.set_start_method('dragon') + + d = dict() + + d['Miska'] = 'Dog' + d['Tigger'] = 'Cat' + d[123] = 456 + d['function'] = client_function + + distdict = DDict(1,10,10000000) + distdict['Miska'] = 'Dog' + distdict['Tigger'] = 'Cat' + distdict[123] = 456 + distdict['function'] = client_function + + with mp.Pool(5) as p: + keys = p.starmap(client_function, [(distdict, client_id) for client_id in range(64)]) + + print(keys) 
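+    # Illustrative sanity check (not part of the original demo): starmap returns the
+    # workers' results in submission order, and every task above used a distinct
+    # client_id, so all 64 returned keys should be unique even though the 5 pool
+    # workers (and hence their pids) are reused across tasks.
+    assert len(keys) == 64 and len(set(keys)) == len(keys)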
+ + for key in keys: + print(f'distdict[{repr(key)}] is mapped to {repr(distdict[key])}') + + distdict.destroy() + +if __name__ == "__main__": + main() + + + diff --git a/examples/dragon_data/dist_dict_bench.py b/examples/dragon_data/dist_dict_bench.py index 9da6363..5bd38a2 100644 --- a/examples/dragon_data/dist_dict_bench.py +++ b/examples/dragon_data/dist_dict_bench.py @@ -17,7 +17,6 @@ class DictOp(enum.Enum): def do_dict_ops(_keys, _dict, client_id, iterations, msg_size, result_link, dict_op): """Function used to execute operations on the shared Dragon dictionary - :param _keys: List of all the keys of the dictionary :type _keys: list :param _dict: A dragon dictionary object @@ -67,7 +66,6 @@ def do_dict_ops(_keys, _dict, client_id, iterations, msg_size, result_link, dict def generate_keys(dict_size=100): """Generate a list including the keys that will be used for the dictionary. - :param dict_size: Total number of keys to be populated in the dicitonary :type dict_size: int :return: List of keys to be stored along with values in the dictionary @@ -77,8 +75,9 @@ def generate_keys(dict_size=100): letters = string.ascii_letters for _ in range(dict_size): - # each key is 30 characters long - key = ''.join(random.choice(letters) for i in range(30)) # characters can be repeated + # each key is 8 characters long + key = ''.join(random.choice(letters) for i in range(8)) # characters can be repeated + my_keys.append(key) assert len(my_keys) == dict_size @@ -88,7 +87,6 @@ def generate_keys(dict_size=100): def assign_keys(_dict, keys, value_size): """Initiate the dictionary. Assign the values to each key in the provided list. Each value is a string of msg_size characters long. - :param _dict: A dragon dictionary object :type _dict: dragon dictionary :param keys: List of keys to assign values in the dictionary @@ -188,6 +186,7 @@ def assign_keys(_dict, keys, value_size): for ii in range(len(dict_ops)): result_links = [mp.Pipe(duplex=False) for _ in range(num_clients)] + print(f'======================= start client operations ==============================') try: procs = [] for i in range(num_clients): @@ -225,4 +224,4 @@ def assign_keys(_dict, keys, value_size): print(e) if my_args.dragon: - dd.stop() + dd.stop() \ No newline at end of file diff --git a/examples/dragon_gs_client/Makefile b/examples/dragon_gs_client/Makefile new file mode 100644 index 0000000..02ea20b --- /dev/null +++ b/examples/dragon_gs_client/Makefile @@ -0,0 +1,20 @@ +CC=gcc + +CFLAGS = -g -pedantic -Wall -I ${CRAY_MPICH_DIR}/include -L ${CRAY_MPICH_DIR}/lib +LD_FLAGS = -lm -L ${CRAY_MPICH_DIR}/lib -lmpich +H_SOURCES = + +MPI_EXE=mpi_hello +MPI_SRC=mpi_hello.c +MPI_OBJECT = $(MPI_SRC:.c=.c.o) + +default: $(MPI_EXE) + +%.c.o: %.c $(H_SOURCES) + $(CC) $(CFLAGS) -c $< -o $@ + +$(MPI_EXE): $(MPI_OBJECT) + $(CC) $(LD_FLAGS) $^ -o $@ + +clean: + $(RM) $(MPI_EXE) $(MPI_OBJECT) diff --git a/examples/dragon_gs_client/README.md b/examples/dragon_gs_client/README.md index d4c42d8..cdf5570 100644 --- a/examples/dragon_gs_client/README.md +++ b/examples/dragon_gs_client/README.md @@ -77,3 +77,62 @@ Completed message with all pids: [12844, 12845, 12846, 12847] ``` This example is not designed to work multi-node. + +## Group API Example + +The Dragon Group API Example efficiently starts a group of Python processes. 
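+
+A condensed sketch of the pattern the demo uses is shown below (illustrative only; see
+`dragon_group_demo.py` in this directory for the complete, runnable version). The Python
+callable is serialized with `cloudpickle`, handed to a fresh interpreter via the Dragon
+native process entry point, and then replicated through `group.create`:
+
+```
+import os
+import sys
+import cloudpickle
+from dragon.globalservices import group, process, policy_eval
+
+def hello():
+    print('Hello', flush=True)
+
+# Relaunch the interpreter and let Dragon's process main unpickle and run the callable.
+target = sys.executable
+args = ["-c",
+        "from dragon.native.process import _dragon_native_python_process_main; "
+        "_dragon_native_python_process_main()"]
+argdata = cloudpickle.dumps((hello, (), {}))
+
+msg = process.get_create_message_with_argdata(
+    exe=target, run_dir=os.getcwd(), args=args, argdata=argdata, pmi_required=False, env=None)
+
+# items is a list of (replica count, serialized create message) tuples.
+grp = group.create(items=[(2, msg.serialize())], policy=policy_eval.Policy(), soft=False)
+```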
+ +How to run the example: + +`dragon dragon_group_demo.py` + +Expected output: + +``` +Hello from x1000c0s2b0n0 +Hello from x1000c0s2b0n0 +Hello from x1000c0s2b0n1 +Hello from x1000c0s2b0n1 +``` + +## Group API MPI Example + +In this example, the Dragon Group API is used to start the mpi_hello MPI application. + +How to run the example: + +``` +make +dragon dragon_group_mpi_demo.py +``` + +Expected Output: + +``` +> make +gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib -c mpi_hello.c -o mpi_hello.c.o +gcc -lm -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib -lmpich mpi_hello.c.o -o mpi_hello + +> dragon dragon_group_mpi_demo.py +Hello world from pid 180513, processor x1000c0s2b0n0, rank 1 out of 4 processors +Hello world from pid 179427, processor x1000c0s2b0n1, rank 0 out of 4 processors +Hello world from pid 180514, processor x1000c0s2b0n0, rank 3 out of 4 processors +Hello world from pid 179428, processor x1000c0s2b0n1, rank 2 out of 4 processors +``` + +## Group Create Add To API Example + +The Dragon Group API Example efficiently starts a group of Python processes. + +How to run the example: + +`dragon dragon_group_create_addto_demo.py` + +Expected output: + +``` +Hello from x1000c0s2b0n0 +Hello from x1000c0s2b0n0 +Hello2 from x1002c0s1b1n0 +Hello2 from x1002c0s3b0n0 +``` \ No newline at end of file diff --git a/examples/dragon_gs_client/dragon_group_create_addto_demo.py b/examples/dragon_gs_client/dragon_group_create_addto_demo.py new file mode 100644 index 0000000..17b598a --- /dev/null +++ b/examples/dragon_gs_client/dragon_group_create_addto_demo.py @@ -0,0 +1,103 @@ +import os +import sys +import cloudpickle + +import dragon + +from dragon.globalservices import node +from dragon.globalservices import group +from dragon.globalservices import process +from dragon.globalservices import policy_eval +from dragon.infrastructure import process_desc +from dragon.utils import host_id + + +def hello(): + my_host_id = host_id() + my_node = node.query(my_host_id) + print(f'Hello from {my_node.name}', flush=True) + + +def hello2(): + my_host_id = host_id() + my_node = node.query(my_host_id) + print(f'Hello2 from {my_node.name}', flush=True) + + +def get_python_process_parameters(target, args, kwargs) -> tuple: + + new_target = sys.executable + new_args = [ + "-c", + "from dragon.native.process import _dragon_native_python_process_main; _dragon_native_python_process_main()", + ] + argdata = cloudpickle.dumps((target, args or (), kwargs or {})) + return new_target, new_args, argdata + + +def main() -> None: + run_dir = os.getcwd() + + target, args, argdata = get_python_process_parameters(target=hello, args=None, kwargs=None) + + # Pipe the stdout output from the head process to a Dragon connection + process_create_msg = process.get_create_message_with_argdata( + exe=target, + run_dir=run_dir, + args=args, + argdata=argdata, + pmi_required=False, + env=None, + ) + + num_processes = node.query_total_cpus() // 4 + print(f'Starting {num_processes} processes', flush=True) + + # Establish the list and number of process ranks that should be started + items = [ + (num_processes, process_create_msg.serialize()), + ] + + # Ask Dragon to create the process group + grp = group.create(items=items, policy=policy_eval.Policy(), soft=False) + + #======== + + target, args, argdata = get_python_process_parameters(target=hello2, args=None, kwargs=None) + + # Pipe the stdout output from the head process to a Dragon connection + process_create_msg = 
process.get_create_message_with_argdata( + exe=target, + run_dir=run_dir, + args=args, + argdata=argdata, + pmi_required=False, + env=None, + ) + + print(f'Creating and adding {num_processes} processes', flush=True) + + # Establish the list and number of process ranks that should be started + items = [ + (num_processes, process_create_msg.serialize()), + ] + + # Ask Dragon to create the process group + grp = group.create_add_to(grp.g_uid, items=items, policy=policy_eval.Policy()) + + group_puids = [] + for resources in grp.sets: + group_puids.extend( + [ + resource.desc.p_uid + for resource in resources + if resource.desc.state == process_desc.ProcessDescriptor.State.ACTIVE + ] + ) + + if len(group_puids) > 0: + process.multi_join(group_puids, join_all=True) + + +if __name__ == "__main__": + main() diff --git a/examples/dragon_gs_client/dragon_group_demo.py b/examples/dragon_gs_client/dragon_group_demo.py new file mode 100644 index 0000000..090c577 --- /dev/null +++ b/examples/dragon_gs_client/dragon_group_demo.py @@ -0,0 +1,72 @@ +import os +import sys +import cloudpickle + +import dragon + +from dragon.globalservices import node +from dragon.globalservices import group +from dragon.globalservices import process +from dragon.globalservices import policy_eval +from dragon.infrastructure import process_desc +from dragon.utils import host_id + + +def hello(): + my_host_id = host_id() + my_node = node.query(my_host_id) + print(f'Hello from {my_node.name}', flush=True) + + +def get_python_process_parameters(target, args, kwargs) -> tuple: + + new_target = sys.executable + new_args = [ + "-c", + "from dragon.native.process import _dragon_native_python_process_main; _dragon_native_python_process_main()", + ] + argdata = cloudpickle.dumps((target, args or (), kwargs or {})) + return new_target, new_args, argdata + + +def main() -> None: + run_dir = os.getcwd() + + target, args, argdata = get_python_process_parameters(target=hello, args=None, kwargs=None) + + # Pipe the stdout output from the head process to a Dragon connection + process_create_msg = process.get_create_message_with_argdata( + exe=target, + run_dir=run_dir, + args=args, + argdata=argdata, + pmi_required=False, + env=None, + ) + + num_processes = node.query_total_cpus() // 2 + print(f'Starting {num_processes} processes', flush=True) + + # Establish the list and number of process ranks that should be started + items = [ + (num_processes, process_create_msg.serialize()), + ] + + # Ask Dragon to create the process group + grp = group.create(items=items, policy=policy_eval.Policy(), soft=False) + + group_puids = [] + for resources in grp.sets: + group_puids.extend( + [ + resource.desc.p_uid + for resource in resources + if resource.desc.state == process_desc.ProcessDescriptor.State.ACTIVE + ] + ) + if len(group_puids) > 0: + process.multi_join(group_puids, join_all=True) + + +if __name__ == "__main__": + main() diff --git a/examples/dragon_gs_client/dragon_group_mpi_demo.py b/examples/dragon_gs_client/dragon_group_mpi_demo.py new file mode 100644 index 0000000..efba27e --- /dev/null +++ b/examples/dragon_gs_client/dragon_group_mpi_demo.py @@ -0,0 +1,48 @@ +import os + +import dragon + +from dragon.globalservices import node +from dragon.globalservices import group +from dragon.globalservices import process +from dragon.globalservices import policy_eval +from dragon.infrastructure import process_desc + + +def main() -> None: + run_dir = os.getcwd() + + process_create_msg = process.get_create_message( + exe=os.path.join(run_dir, 
"mpi_hello"), + run_dir=run_dir, + args=[], + pmi_required=True, + env=None, + ) + + # num_processes = 4 + num_processes = node.query_total_cpus() // 2 + print(f'Starting {num_processes} processes', flush=True) + + # Establish the list and number of process ranks that should be started + create_items = [ + (num_processes, process_create_msg.serialize()), + ] + + # Ask Dragon to create the process group + grp = group.create(items=create_items, policy=policy_eval.Policy(), soft=False) + + group_puids = [] + for resources in grp.sets: + group_puids.extend( + [ + resource.desc.p_uid + for resource in resources + if resource.desc.state == process_desc.ProcessDescriptor.State.ACTIVE + ] + ) + if len(group_puids) > 0: + process.multi_join(group_puids, join_all=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/dragon_gs_client/mpi_hello.c b/examples/dragon_gs_client/mpi_hello.c new file mode 100644 index 0000000..7d198ec --- /dev/null +++ b/examples/dragon_gs_client/mpi_hello.c @@ -0,0 +1,32 @@ +#include +#include +#include +#include + + +int main(int argc, char** argv) { + + // Initialize the MPI environment + MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + + // Print off a hello world message + printf("Hello world from pid %d, processor %s, rank %d out of %d processors\n", + getpid(), processor_name, world_rank, world_size); + + // Finalize the MPI environment. + MPI_Finalize(); + +} diff --git a/examples/dragon_native/mpi/README.md b/examples/dragon_native/mpi/README.md index 61bf133..a9f7c0d 100644 --- a/examples/dragon_native/mpi/README.md +++ b/examples/dragon_native/mpi/README.md @@ -5,7 +5,7 @@ start MPI applications. ## MPI Pool Workers Demo -The file `mpi_pool_workers_demo.py` contains a program that starts a single rank +The file `mpi_process_group_demo.py` contains a program that starts a single rank MPI hello world application on each of the allocated workload manager nodes. As support for MPI applications is expanded, Dragon will support multiple ranks per node, partitioning of nodes, etc. @@ -22,13 +22,13 @@ node, partitioning of nodes, etc. ``` 3. Run the dragon example! ``` - > dragon mpi_pool_workers_demo.py + > dragon mpi_process_group_demo.py ``` ### Example Output ``` -login> dragon mpi_pool_workers_demo.py +login> dragon mpi_process_group_demo.py [stdout: p_uid=4294967297] Hello world from processor pinoak0202, rank 1 out of 2 processors [stdout: p_uid=4294967298] Hello world from processor pinoak0201, rank 0 out of 2 processors ``` @@ -86,3 +86,65 @@ INFO:consumer:{0: ('4', '62.30')} INFO:consumer:{0: ('8', '68.07')} ... ``` + +## Policy Demo + +The file `policy_demo.py` contains a program that shows how policies can be passed to process groups and processes that are a part of the process group. This example highlights how an MPI application can be launched on a subset of the allocated nodes and how a policy restricting the cpu affinity can be applied to the whole group. + +### Running the example + +1. Run `make` to build the mpi_hello example application. + ``` + > make + ``` +2. Get an allocation of nodes + ``` + > salloc --nodes=2 --exclusive + ``` +3. Run the dragon example! 
+ ``` + > dragon policy_demo.py + ``` + +### Example Output + +``` + dragon policy_demo.py +Using 2 of 4 +pinoak0015 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] +pinoak0016 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] +pinoak0014 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] +pinoak0013 has AMD GPUs with visible devices: [0, 1, 2, 3, 4, 5, 6, 7] +4294967298 returned output: Hello world from pid 57645, processor pinoak0015, rank 0 out of 16 processors + +4294967299 returned output: Hello world from pid 57646, processor pinoak0015, rank 1 out of 16 processors + +4294967300 returned output: Hello world from pid 57647, processor pinoak0015, rank 2 out of 16 processors + +4294967301 returned output: Hello world from pid 57648, processor pinoak0015, rank 3 out of 16 processors + +4294967302 returned output: Hello world from pid 57649, processor pinoak0015, rank 4 out of 16 processors + +4294967303 returned output: Hello world from pid 57650, processor pinoak0015, rank 5 out of 16 processors + +4294967304 returned output: Hello world from pid 57651, processor pinoak0015, rank 6 out of 16 processors + +4294967305 returned output: Hello world from pid 57652, processor pinoak0015, rank 7 out of 16 processors + +4294967306 returned output: Hello world from pid 56247, processor pinoak0016, rank 8 out of 16 processors + +4294967307 returned output: Hello world from pid 56248, processor pinoak0016, rank 9 out of 16 processors + +4294967308 returned output: Hello world from pid 56249, processor pinoak0016, rank 10 out of 16 processors + +4294967309 returned output: Hello world from pid 56250, processor pinoak0016, rank 11 out of 16 processors + +4294967310 returned output: Hello world from pid 56251, processor pinoak0016, rank 12 out of 16 processors + +4294967311 returned output: Hello world from pid 56252, processor pinoak0016, rank 13 out of 16 processors + +4294967312 returned output: Hello world from pid 56253, processor pinoak0016, rank 14 out of 16 processors + +4294967313 returned output: Hello world from pid 56254, processor pinoak0016, rank 15 out of 16 processors +``` + diff --git a/examples/dragon_native/mpi/hpc_workflow_demo_highlevel.py b/examples/dragon_native/mpi/hpc_workflow_demo_highlevel.py index acf2ee5..7d745ad 100644 --- a/examples/dragon_native/mpi/hpc_workflow_demo_highlevel.py +++ b/examples/dragon_native/mpi/hpc_workflow_demo_highlevel.py @@ -9,7 +9,7 @@ from dragon.globalservices import node from dragon.globalservices.process import multi_join from dragon.infrastructure.connection import Connection -from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, TemplateProcess +from dragon.native.process import MSG_PIPE, MSG_DEVNULL, Process, ProcessTemplate from dragon.native.process_group import ProcessGroup logging.basicConfig(level=logging.INFO) @@ -43,13 +43,13 @@ def producer_proc(producer_id: int, num_ranks: int, result_queue: mp.Queue) -> N # Pipe the stdout output from the head process to a Dragon connection grp.add_process( nproc=1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE) + template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_PIPE) ) # All other ranks should have their output go to DEVNULL grp.add_process( nproc=num_ranks-1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL) + template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=MSG_DEVNULL) ) grp.init() diff --git 
a/examples/dragon_native/mpi/mpi_process_group_demo.py b/examples/dragon_native/mpi/mpi_process_group_demo.py
new file mode 100644
index 0000000..60bb2ea
--- /dev/null
+++ b/examples/dragon_native/mpi/mpi_process_group_demo.py
@@ -0,0 +1,37 @@
+"""
+Run a sample MPI Hello World application with 1 rank per allocated node.
+"""
+
+import os
+
+from dragon.native.process import ProcessTemplate
+from dragon.native.process_group import ProcessGroup
+from dragon.globalservices.node import get_list
+
+
+def main():
+    nnodes = len(get_list())
+    mpi_hello_cmd = os.path.join(os.getcwd(), "mpi_hello")
+    args = []
+    cwd = os.getcwd()
+
+    if not nnodes:
+        print("No slurm allocation detected.")
+        raise SystemExit(-1)
+
+    pool = ProcessGroup(restart=False, pmi_enabled=True)
+    pool.add_process(
+        nproc=nnodes,
+        template=ProcessTemplate(
+            target=mpi_hello_cmd, args=args, cwd=cwd, env=None
+        ),
+    )
+    pool.init()
+    pool.start()
+    pool.join()
+
+    return 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/dragon_native/mpi/policy_demo.py b/examples/dragon_native/mpi/policy_demo.py
new file mode 100644
index 0000000..c6dfbb8
--- /dev/null
+++ b/examples/dragon_native/mpi/policy_demo.py
@@ -0,0 +1,86 @@
+
+from dragon.native.process import Process, ProcessTemplate, Popen
+from dragon.native.process_group import ProcessGroup
+from dragon.infrastructure.connection import Connection
+from dragon.infrastructure.policy import Policy
+from dragon.native.machine import System, Node
+
+import os
+
+def parse_results(stdout_conn: Connection) -> str:
+    """Read stdout from the Dragon connection.
+
+    :param stdout_conn: Dragon connection to stdout
+    :type stdout_conn: Connection
+    :return: string of output received on stdout
+    :rtype: str
+    """
+    output = ""
+    try:
+        # this is brute force
+        while True:
+            output += stdout_conn.recv()
+    except EOFError:
+        pass
+    finally:
+        stdout_conn.close()
+
+    return output.strip('\n')
+
+
+
+def main_policy_example():
+
+    # an abstraction of my allocated nodes
+    my_alloc = System()
+    num_procs_per_node = 8
+    node_list = my_alloc.nodes
+    nnodes = my_alloc.nnodes()
+    num_nodes_to_use = int(nnodes/2)
+
+    print(f'Using {num_nodes_to_use} of {nnodes}', flush=True)
+
+    nodes = {}
+    for node_id in node_list:
+        node = Node(node_id)
+        nodes[node.hostname] = node
+
+    for hostname, node in nodes.items():
+        print(f'{hostname} has {node.gpu_vendor} GPUs with visible devices: {node.gpus}', flush=True)
+
+
+    # define mpi application and my args
+    exe = os.path.join(os.getcwd(), "mpi_hello")
+    args = []
+    run_dir = os.getcwd()
+
+    # restrict cpu affinity for every member of the group
+    cpu_affinity = [0, 16, 32, 48, 64, 80, 96, 112]
+    group_policy = Policy(affinity=Policy.Affinity.SPECIFIC, cpu_affinity=cpu_affinity)
+
+    # Define group and give it the group policy
+    grp = ProcessGroup(restart=False, pmi_enabled=True, policy=group_policy)
+
+    # Add processes to the group with local policies specifying what node to be placed on
+    for node_num in range(num_nodes_to_use):
+        node_name = list(nodes.keys())[node_num]
+        local_policy = Policy(placement=Policy.Placement.HOST_NAME, host_name=node_name)
+        grp.add_process(nproc=num_procs_per_node, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE, policy=local_policy))
+
+    grp.init()
+    grp.start()
+    group_procs = [Process(None, ident=puid) for puid in grp.puids]
+    for proc in group_procs:
+        # get info printed to stdout from each rank
+        if proc.stdout_conn:
+            stdout = parse_results(proc.stdout_conn)
+            print(f'{proc.puid} returned 
output: {stdout}', flush=True) + + # wait for workers to finish and shutdown process group + grp.join() + grp.stop() + + +if __name__ == "__main__": + main_policy_example() + diff --git a/examples/dragon_workflows/lazy_attach.py b/examples/dragon_workflows/lazy_attach.py new file mode 100644 index 0000000..daa00a6 --- /dev/null +++ b/examples/dragon_workflows/lazy_attach.py @@ -0,0 +1,112 @@ +import dragon +import multiprocessing as mp +import io +import os +import socket +import sys +import time + +from dragon.native.process import ProcessTemplate, Popen, Process +from dragon.native.process_group import ProcessGroup +import dragon.workflows.runtime as runtime + + +def cleanup(conn, q, procs, grp): + conn.close() + del(conn) + del(q) + for p in procs: + del(p) + del(grp) + + +def howdy(q): + q.put(f'howdy from {socket.gethostname()} - local num cores is {os.cpu_count()}, runtime available cores is {mp.cpu_count()}') + + +def signal_exit(): + path = '/home/users/nradclif/hpc-pe-dragon-dragon/examples/dragon_workflows/client_exit' + file = open(path, 'w') + + +def main(): + mp.set_start_method('dragon') + username = os.environ['USER'] + if len(sys.argv) > 1: + system = sys.argv[1] + else: + system = 'hotlum-login' + + runtime_sdesc = runtime.lookup(system, 'my-runtime', 30) + proxy = runtime.attach(runtime_sdesc) + + print('\n') + + # test process and queue + + proxy.enable() + + q = mp.Queue() + procs = [] + + for _ in range(2): + p = mp.Process(target=howdy, args=(q,)) + procs.append(p) + + for p in procs: + p.start() + msg = q.get() + print(f'Message from remote runtime: {msg}', flush=True) + + for p in procs: + p.join() + + # launch the mpi job + + grp = ProcessGroup(restart=False, pmi_enabled=True) + + # TODO: it seems like the client tries to verify that mpi_hello exists locally + num_ranks = 4 + exe = './mpi_hello' + grp.add_process( + nproc=1, + template=ProcessTemplate(target=exe, args=[], env=proxy.get_env(), cwd=os.getcwd(), stdout=Popen.PIPE) + ) + grp.add_process( + nproc=num_ranks - 1, + template=ProcessTemplate(target=exe, args=[], env=proxy.get_env(), cwd=os.getcwd(), stdout=Popen.DEVNULL) + ) + grp.init() + grp.start() + + while None in grp.puids: + time.sleep(2) + + # get remote runtime's stdout + + child_resources = [Process(None, ident=puid) for puid in grp.puids] + conn = child_resources[0].stdout_conn + try: + while True: + print(f'{conn.recv()}', flush=True) + except EOFError: + pass + + # wait for MPI job to complete + + grp.join() + grp.stop() + + # signal client's exit + + exit_proc = mp.Process(target=signal_exit, args=()) + exit_proc.start() + exit_proc.join() + + cleanup(conn, q, procs, grp) + + proxy.disable() + + +if __name__ == "__main__": + main() diff --git a/examples/dragon_workflows/mpi_hello.c b/examples/dragon_workflows/mpi_hello.c new file mode 100644 index 0000000..d1051a4 --- /dev/null +++ b/examples/dragon_workflows/mpi_hello.c @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include + + +int main() +{ + char filename[128]; + //sprintf(filename, "mpi_hello.%d.log", getpid()); + //FILE *log = fopen(filename, "w"); + FILE *log = stdout; + + fprintf(log, "Starting MPI process with pid %d\n", getpid()); + fflush(log); + + MPI_Init(NULL, NULL); + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + fprintf(log, "Hello there, my name is MPI Process %d\n", rank); + fflush(log); + + int send_data = 3 * rank; + int *recv_buf = malloc(world_size * sizeof(int)); + assert(recv_buf != 
NULL); + + int mpi_err = MPI_Allgather(&send_data, 1, MPI_INT, + recv_buf, 1, MPI_INT, + MPI_COMM_WORLD); + assert(mpi_err == MPI_SUCCESS); + + int i; + int num_failed = 0; + + for (i = 0; i < world_size; ++i) { + int expected = 3 * i; + fprintf(log, "Received %d from rank %d; expected %d\n", recv_buf[i], i, expected); + if (recv_buf[i] != expected) { + ++num_failed; + } + } + + //unlink(filename); + + free(recv_buf); + fclose(log); + MPI_Finalize(); + + sleep(2); + + return EXIT_SUCCESS; +} diff --git a/examples/dragon_workflows/run_client.sh b/examples/dragon_workflows/run_client.sh new file mode 100755 index 0000000..7b313aa --- /dev/null +++ b/examples/dragon_workflows/run_client.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +dragon -s lazy_attach.py ${1} diff --git a/examples/dragon_workflows/run_server.sh b/examples/dragon_workflows/run_server.sh new file mode 100755 index 0000000..054b960 --- /dev/null +++ b/examples/dragon_workflows/run_server.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +#dragon-cleanup > /dev/null 2>&1 +../../src/bin/dragon-cleanup > /dev/null 2>&1 +rm ./*.log core > /dev/null 2>&1 +rm ~/.dragon/my-runtime > /dev/null 2>&1 +cc -o mpi_hello mpi_hello.c +dragon server.py +rm mpi_hello diff --git a/examples/dragon_workflows/server.py b/examples/dragon_workflows/server.py new file mode 100644 index 0000000..0f0d3d2 --- /dev/null +++ b/examples/dragon_workflows/server.py @@ -0,0 +1,18 @@ +import os +import time +import dragon.infrastructure.parameters as dparm +import dragon.workflows.runtime as runtime + + +def wait_for_exit(): + path = '/home/users/nradclif/hpc-pe-dragon-dragon/examples/dragon_workflows/client_exit' + while not os.path.exists(path): + time.sleep(1) + time.sleep(1) + if dparm.this_process.index == 0: + os.remove(path) + + +sdesc = runtime.publish('my-runtime') +print(f'Runtime serialized descriptor: {sdesc}', flush=True) +wait_for_exit() diff --git a/examples/multiprocessing/README.md b/examples/multiprocessing/README.md index 75b4f3a..2a783da 100644 --- a/examples/multiprocessing/README.md +++ b/examples/multiprocessing/README.md @@ -464,4 +464,17 @@ primes in pipeline stage are [17, 19, 23] primes in pipeline stage are [7, 11, 13] primes in pipeline stage are [2, 3, 5] -$ \ No newline at end of file +$ + +## Pascal Triangle: Shared State Context Demo + +In this example, we demonstrate that the Dragon multiprocessing interface can be used to create a simple shared state example. The manager multiprocessing process and the client multiprocessing process communicate via a shared state spawned by the context multiprocessing process. The main multiprocessing process will start the manager and client multiprocessing processes. The manager process finds the sum of the Pascal triangle array calcualated by the client process. The third multiprocessing process spawned by the context class finds when the Pascal triangle has been completed. The shared state that contains the Pascal triangle array and the Pascal triangle sum is guarded by a lock; only the process that accesses the lock may alter the array and value. + +### Example Output + +``` +> dragon shared_state_pascal_triangle.py --rows 5 +Pascal Triangle Array Calculated for 5 rows from the Pascal row of 0 to the Pascal row of 5 , and the associated sum of the Pascal triangle array. 
+Pascal Triangle Array [1, 1, 1, 1, 1, 2, 1, 1, 3, 3, 1, 1, 4, 6, 4, 1] +Pascal Triangle Sum: 32 +``` \ No newline at end of file diff --git a/examples/multiprocessing/numpy-mpi4py-examples/README.md b/examples/multiprocessing/numpy-mpi4py-examples/README.md index 4e206d3..2544a5d 100644 --- a/examples/multiprocessing/numpy-mpi4py-examples/README.md +++ b/examples/multiprocessing/numpy-mpi4py-examples/README.md @@ -29,7 +29,7 @@ In order to run multi-node with Dragon, the `--dragon` option is needed. For exa ### Usage -Simple SciPy example implemented in Parsl and comparable to the implementation above. This example uses the batched dragon executor that utilizes a multiprocessing pool and submits batches of work through the map_async function. +Simple SciPy example implemented in Parsl and comparable to the implementation above. This example uses the batched dragon executor that utilizes a multiprocessing pool and submits batches of work through the map_async function. ``` dragon parsl_batched_scipy_scale_work.py [-h] [--num_workers NUM_WORKERS] [--iterations ITERATIONS] [--burns BURN_ITERATIONS] [--size ARRAY_SIZE] diff --git a/examples/multiprocessing/numpy-mpi4py-examples/numpy_scale_work.py b/examples/multiprocessing/numpy-mpi4py-examples/numpy_scale_work.py index 1c4ba20..2770d73 100644 --- a/examples/multiprocessing/numpy-mpi4py-examples/numpy_scale_work.py +++ b/examples/multiprocessing/numpy-mpi4py-examples/numpy_scale_work.py @@ -54,7 +54,7 @@ def g(size): multiprocessing.util.log_to_stderr(logging.DEBUG) - print(f"Multiprocessing start method: {multiprocessing.get_start_method()}") + print(f"Multiprocessing start method: {multiprocessing.get_start_method()}", flush=True) num_cpus = args.num_workers ar_size = int(1e3 * args.work_size) diff --git a/examples/multiprocessing/shared_state_pascal_triangle.py b/examples/multiprocessing/shared_state_pascal_triangle.py new file mode 100644 index 0000000..3da2d29 --- /dev/null +++ b/examples/multiprocessing/shared_state_pascal_triangle.py @@ -0,0 +1,313 @@ +"""Pascal Triangle Shared State Example + +The Pascal Triange is a famous mathematics concept that gives the binomial coefficients for any binomial expansion. +The Pascal triangle row is constructed by summing up the elements in the preceding row. +The following example has an implementation of the Pascal triangle where the user provides the number of rows. +The main multiprocessing process starts the manager multiprocessing process and the client multiprocessing process. +The manager multiprocessing process starts the context multiprocessing process which creates a shared state with the array that contains all the elements of the Pascal array +and the value that is the sum of the Pascal triangle. +The manager, client, and context multiprocessing processes share the state and pass it to each other via a series of queues. +The manager process increments the value, and the client process adds rows from the Pascal triangle to the Pascal triangle array. +The context process uses an event to signal when the number of rows provided by the user has been reached, the Pascal triangle array has been filled, +and the sum of the Pascal triangle has been calculated with the expected value. +The main process outputs the Pascal triangle array and the Pascal triangle sum. +The shared state is guarded by a lock, and the process that is accessing and modifying the state needs the lock. 
+""" + + +import cloudpickle +import dragon +import multiprocessing as mp +import time +import sys +import argparse + + +def pascal(rows): + # create pascal array for index + rows -= 1 + pascal_row = [1] + for row in range(max(rows, 0)): + pascal_row.append(pascal_row[row] * (rows - row) // (row + 1)) + return pascal_row + + +def create_shared_state(queue, rows, shared_state_queue_created): + # create value and array for shared state + value = mp.Value("i", 0) + # create the array the size of the completed Pascal triangle array + array = mp.Array("i", [0] * sum(range(rows + 1))) + # shared state that is used by all the processes + shared_state = cloudpickle.dumps((value, array)) + # place the Pascal queue + queue.put(shared_state) + # the context process does not create the shared state again + shared_state_queue_created.set() + + +def context(serialized_args: bytes) -> None: + """ + Context checks if Pascal sum from mp.Value is correct + """ + # if a certain length is reached, outputs the Pascal triangle + ( + shared_state_queue, + lock_queue, + rows, + pascal_event, + shared_state_queue_created, + final_shared_state_queue, + ) = cloudpickle.loads(serialized_args) + # if pascal event is set + while not pascal_event.is_set(): + # create lock + lock = None + if lock_queue.empty(): + lock = mp.Lock() + else: + try: + lock = cloudpickle.loads(lock_queue.get(timeout=1)) + except: + pass + # if lock is not None, enter this step + if lock is not None: + with lock: + # create the shared state if shared state not set + if shared_state_queue.empty() and not shared_state_queue_created.is_set(): + create_shared_state(shared_state_queue, rows, shared_state_queue_created) + else: + value, array = None, None + try: + # grab the value and array from the pascal creation queue + value, array = cloudpickle.loads(shared_state_queue.get(timeout=1)) + except: + pass + # this comparison happens outside the try and except; checks that array and value exists and checks that the operations on Pascal triangle are complete. 
+ if value is not None and array is not None and value.value == (2**rows): + # set pascal event and break out + pascal_event.set() + array = [1] + array[:] + # put the values in the queue that communicates with master + final_shared_state_queue.put(cloudpickle.dumps((value, array))) + break + # place value and array back in the pascal queue + shared_state_queue.put(cloudpickle.dumps((value, array))) + # put lock back in lock queue + lock_queue.put(cloudpickle.dumps(lock)) + time.sleep(0.1) + else: + pass + + +def start_context(serialized_args: bytes) -> None: + ( + shared_state_queue, + lock_queue, + rows, + pascal_event, + shared_state_queue_created, + final_shared_state_queue, + ) = cloudpickle.loads(serialized_args) + context_proc = mp.Process(target=context, args=(serialized_args,)) + context_proc.start() + # join the context process if the pascal_event is set + if pascal_event.is_set(): + context_proc.join() + + +def manager(serialized_args: bytes) -> None: + """ + Manager sums Pascal triangle array for mp.Value + """ + # receive context, pascal queue where value and array are passed, lock queue where lock is passed, rows that is passed to context, pascal event and addition event + ( + context, + shared_state_queue, + lock_queue, + rows, + pascal_event, + addition_event, + pascal_iterator, + context_event, + shared_state_queue_created, + final_shared_state_queue, + ) = list(cloudpickle.loads(serialized_args)) + # start context with the pascal queue, lock queue, rows, and pascal event + if not context_event.is_set(): + context_serialized_args = cloudpickle.dumps( + ( + shared_state_queue, + lock_queue, + rows, + pascal_event, + shared_state_queue_created, + final_shared_state_queue, + ) + ) + start_context(context_serialized_args) + # do not create any more context processes + context_event.set() + # keep manager process alive while pascal event is not set + while not pascal_event.is_set(): + # check if the addition event is set where a new row was added + try: + lock = cloudpickle.loads(lock_queue.get(timeout=1)) + with lock: + # client added a row to the pascal array + if addition_event.is_set(): + # grab value and array from pascal queue + value, array = None, None + try: + value, array = cloudpickle.loads(shared_state_queue.get(timeout=1)) + except: + pass + # this comparison happens outside the try and except; check if the value needs to be incremented + if ( + value is not None + and array is not None + and value.value < (2**rows) + and pascal_iterator.value < rows + ): + # find the sum of the Pascal triangle array + value.value = sum((array[:])) + 1 + # clear addition event + addition_event.clear() + # add another row to the Pascal triangle + pascal_iterator.value += 1 + # put the value and array in the pascal queue + shared_state_queue.put(cloudpickle.dumps((value, array))) + # put the lock in the lock queue + lock_queue.put(cloudpickle.dumps(lock)) + # manager finished summation of pascal array + addition_event.clear() + except: + pass + + +def client(serialized_args: bytes) -> None: + """ + Client adds array to mp.Array + """ + ( + shared_state_queue, + lock_queue, + pascal_event, + addition_event, + rows, + pascal_iterator, + index_iterator, + ) = list(cloudpickle.loads(serialized_args)) + # keep client process alive while pascal event is not set + while not pascal_event.is_set(): + try: + lock = cloudpickle.loads(lock_queue.get(timeout=1)) + with lock: + # manager has completed addition event + if not addition_event.is_set(): + # grab value and array from pascal queue + 
value, array = None, None + try: + value, array = cloudpickle.loads(shared_state_queue.get(timeout=1)) + except: + pass + # this comparison happens outside the try and except; if the Pascal value is less than what is expected from a Pascal triangle of that size and the index within the Pascal triangle array is less than the largest index of the array + if ( + value is not None + and array is not None + and value.value < (2**rows) + and index_iterator.value <= (sum(range(rows))) + ): + # create the array for the index of interest + pascal_row = pascal_iterator.value + new_array = pascal(pascal_row + 1) + # add the element from the array generated to the Pascal triangle array + for element in new_array: + array[index_iterator.value] = element + index_iterator.value += 1 + # put the value and array in the pascal queue + shared_state_queue.put(cloudpickle.dumps((value, array))) + # put the lock in the lock queue + lock_queue.put(cloudpickle.dumps(lock)) + # array has been added to pascal triangle array + addition_event.set() + except: + pass + + +def main(): + # create parser that grabs the row of interest from the user + parser = argparse.ArgumentParser(description="Pascal Triangle Test") + # the default argument is 5 + parser.add_argument("--rows", type=int, default=5, help="number of rows in Pascal triangle") + my_args = parser.parse_args() + rows = my_args.rows + # pascal queue is used for creating the pascal triangle array and value for pascal triangle, lock is passed between processes, and answer queue is used to pass the final pascal triangle between manager and context + shared_state_queue, lock_queue, final_shared_state_queue = mp.Queue(), mp.Queue(), mp.Queue() + # pascal event signals completion of event, addition process signals that the client process added another row of the Pascal triangle to the array, context event is used to signal context process is created, and shared_state_queue_created signals that the shared state is created + pascal_event, addition_event, context_event, shared_state_queue_created = ( + mp.Event(), + mp.Event(), + mp.Event(), + mp.Event(), + ) + # pascal iterator provides row of the Pascal triangle and the index iterator provides the index in the Pascal triangle array + pascal_iterator, index_iterator = mp.Value("i", 0), mp.Value("i", 0) + # client adds the rows to the pascal triangle array until pascal event is triggered. Adds rows when addition event is set. + client_serialized_args = cloudpickle.dumps( + ( + shared_state_queue, + lock_queue, + pascal_event, + addition_event, + rows, + pascal_iterator, + index_iterator, + ) + ) + # manager starts context. Adds to the triangle value and sets addition event. Waits on pascal event to be triggered. 
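+    # (all of the shared objects are bundled into one cloudpickle blob so each mp.Process
+    # target receives a single serialized argument, mirroring client_serialized_args above)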
+ manager_serialized_args = cloudpickle.dumps( + ( + context, + shared_state_queue, + lock_queue, + rows, + pascal_event, + addition_event, + pascal_iterator, + context_event, + shared_state_queue_created, + final_shared_state_queue, + ) + ) + manager_proc = mp.Process(target=manager, args=(manager_serialized_args,)) + client_proc = mp.Process(target=client, args=(client_serialized_args,)) + # start manager process + manager_proc.start() + # context created + context_event.wait() + # start client process once manager and context processes started + client_proc.start() + # pascal triangle array filled + pascal_event.wait() + value, array = cloudpickle.loads(final_shared_state_queue.get(timeout=1)) + # print the Pascal triangle statistics + print( + "Pascal Triangle Array Calculated for", + rows, + "rows from the Pascal row of 0 to the Pascal row of", + rows, + ", and the associated sum of the Pascal triangle array.", + flush=True, + ) + print("Pascal Triangle Array", array[:], flush=True) + print("Pascal Triangle Sum:", value.value, flush=True) + # join the manager and client processes + manager_proc.join() + client_proc.join() + + +if __name__ == "__main__": + # set dragon start process + mp.set_start_method("dragon") + # start main process + main() diff --git a/examples/multiprocessing/unittests/common.py b/examples/multiprocessing/unittests/common.py index 47f5ead..d40b59a 100644 --- a/examples/multiprocessing/unittests/common.py +++ b/examples/multiprocessing/unittests/common.py @@ -9,13 +9,13 @@ import gc import test.support -try: +try: from test.support.import_helper import import_module - from test.support.threading_helper import join_thread + from test.support.threading_helper import join_thread except ImportError: #location prior to Python 3.10 from test.support import import_module - from test.support import join_thread + from test.support import join_thread import threading diff --git a/examples/multiprocessing/unittests/test_others.py b/examples/multiprocessing/unittests/test_others.py index 71341dc..6a13b61 100644 --- a/examples/multiprocessing/unittests/test_others.py +++ b/examples/multiprocessing/unittests/test_others.py @@ -1245,7 +1245,7 @@ def test_namespace(self): class MiscTestCase(unittest.TestCase): def test__all__(self): # Just make sure names in blacklist are excluded - try: + try: test.support.check__all__( self, multiprocessing, extra=multiprocessing.__all__, not_exported=["SUBDEBUG", "SUBWARNING"] ) diff --git a/examples/multiprocessing/unittests/test_pool.py b/examples/multiprocessing/unittests/test_pool.py index 392df9f..4294ef5 100644 --- a/examples/multiprocessing/unittests/test_pool.py +++ b/examples/multiprocessing/unittests/test_pool.py @@ -260,7 +260,7 @@ def test_imap_unordered_handle_iterable_exception(self): self.assertIn(value, expected_values) expected_values.remove(value) - + def test_make_pool(self): expected_error = RemoteError if self.TYPE == "manager" else ValueError diff --git a/examples/multiprocessing/unittests/test_process.py b/examples/multiprocessing/unittests/test_process.py index 3681eb1..02e2400 100644 --- a/examples/multiprocessing/unittests/test_process.py +++ b/examples/multiprocessing/unittests/test_process.py @@ -11,15 +11,15 @@ import unittest import test.support -try: - from test.support.os_helper import fd_count - from test.support.os_helper import TESTFN - from test.support.os_helper import unlink +try: + from test.support.os_helper import fd_count + from test.support.os_helper import TESTFN + from test.support.os_helper import 
unlink except ImportError: #location prior to Python 3.10 from test.support import fd_count - from test.support import TESTFN - from test.support import unlink + from test.support import TESTFN + from test.support import unlink import threading diff --git a/examples/multiprocessing/unittests/test_queue.py b/examples/multiprocessing/unittests/test_queue.py index 4199896..6a9a17a 100644 --- a/examples/multiprocessing/unittests/test_queue.py +++ b/examples/multiprocessing/unittests/test_queue.py @@ -6,12 +6,12 @@ import test.support from test.support import hashlib_helper -try: +try: from test.support.os_helper import temp_cwd from test.support.import_helper import DirsOnSysPath except ImportError: #location prior to Python 3.10 - from test.support import temp_cwd + from test.support import temp_cwd from test.support import DirsOnSysPath diff --git a/examples/workflows/ai-in-the-loop/README.md b/examples/workflows/ai-in-the-loop/README.md index caf64c3..16998e0 100755 --- a/examples/workflows/ai-in-the-loop/README.md +++ b/examples/workflows/ai-in-the-loop/README.md @@ -1,4 +1,4 @@ -# AI-in-the-loop workflow with Dragon +# AI-in-the-loop workflow with Dragon ## Introduction This is an example of how Dragon can be used to execute an AI-in-the-loop workflow. Inspiration for this demo comes from the NERSC-10 Workflow Archetypes White Paper. This workflow most closely resembles the workflow scenario given as part of archetype four. In this example we use a small model implemented in PyTorch to compute an approximation to sin(x). In parallel to doing the inference with the model, we launch `sim-cheap` on four ranks. This MPI job computes the taylor approximation to sin(x) and compares this with the output of the model. If the difference is less than 0.05 we consider the model's approximation to be sufficiently accurate and print out the result with the exact result. If the difference is larger than 0.05 we consider this a failure and re-train the model on a new set of data. To generate this data we launch `sim-expensive`. This MPI job is launched on eight ranks-per-node and each rank generates 32 data points of the form (x, sin(x)) where x is sampled uniformly in [-pi, pi). This data is aggregated into a PyTorch tensor and then used to train the model. We then re-evaluate the re-trained model and decide if we need to re-train again or if the estimate is sufficiently accurate. We continue this loop until we've had five successes. @@ -8,29 +8,29 @@ Below is a diagram of the main computational loop. ⬇ Parallel Execution ⬅ Re-train the AI Model ⬇ ⬇ - Infer Calculate -value from comparison + Infer Calculate +value from comparison AI Model using four ⬆ rank MPI job ⬇ ⬇ Parallel Execution ⬇ Is the inferred No Launch expensive MPI process - value within ⮕ to generate new data + value within ⮕ to generate new data tolerance? ⬇ Yes -``` +``` ## Usage -`ai-in-the-loop.py` - This is the main file. It contains functions for launching both MPI executables and parsing the results as well as imports functions defined in `model.py` and coordinates the model inference and training with the MPI jobs. +`ai-in-the-loop.py` - This is the main file. It contains functions for launching both MPI executables and parsing the results as well as imports functions defined in `model.py` and coordinates the model inference and training with the MPI jobs. -`model.py` - This file defines the model and provides some functions for model training and inference. 
+`model.py` - This file defines the model and provides some functions for model training and inference. `sim-expensive.c` - This contains what we are considering the expensive MPI job. It computes (x, sin(x)) data points that are used to train the model. -`sim-cheap.c` - This is the cheap approximation. It computes the Taylor approximation of sin(x). +`sim-cheap.c` - This is the cheap approximation. It computes the Taylor approximation of sin(x). `Makefile` - Used to build the two MPI applications. @@ -40,9 +40,9 @@ value from comparison usage: dragon ai-in-the-loop.py ``` -## Installation +## Installation -After installing dragon, the only other dependency is on PyTorch and SciPy. The PyTorch version and corresponding pip command can be found here (https://pytorch.org/get-started/locally/). +After installing dragon, the only other dependency is on PyTorch and SciPy. The PyTorch version and corresponding pip command can be found here (https://pytorch.org/get-started/locally/). ``` > pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu @@ -58,12 +58,12 @@ For this example, HPE Cray Hotlum nodes were used. Each node has AMD EPYC 7763 6 ### Multi-node -The default parameters are for 16 nodes but this example has been run up to 64 nodes with 8 ranks-per-node. +The default parameters are for 16 nodes but this example has been run up to 64 nodes with 8 ranks-per-node. ``` > make gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-cheap.o sim-cheap.c gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib sim-cheap.o -o sim-cheap -lm -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -lmpich -gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-expensive.o +gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-expensive.o gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib sim-expensive.o -o sim-expensive -lm -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -lmpich > salloc --nodes=16 --exclusive > dragon ai-in-the-loop.py diff --git a/examples/workflows/ai-in-the-loop/ai-in-the-loop.py b/examples/workflows/ai-in-the-loop/ai-in-the-loop.py index 8f8beab..1adab60 100755 --- a/examples/workflows/ai-in-the-loop/ai-in-the-loop.py +++ b/examples/workflows/ai-in-the-loop/ai-in-the-loop.py @@ -7,7 +7,7 @@ from itertools import count from model import Net, make_features, infer, train -from dragon.native.process import Process, TemplateProcess, Popen +from dragon.native.process import Process, ProcessTemplate, Popen from dragon.native.process_group import ProcessGroup from dragon.infrastructure.connection import Connection from dragon.native.machine import System @@ -70,12 +70,12 @@ def generate_data( grp = ProcessGroup(restart=False, pmi_enabled=True) # Pipe the stdout output from the head process to a Dragon connection - grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) + grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) # All other ranks should have their output go to DEVNULL grp.add_process( nproc=num_ranks - 1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), + 
template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), ) # start the process group grp.init() @@ -111,12 +111,12 @@ def compute_cheap_approx(num_ranks: int, x: float) -> float: grp = ProcessGroup(restart=False, pmi_enabled=True) # Pipe the stdout output from the head process to a Dragon connection - grp.add_process(nproc=1, template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) + grp.add_process(nproc=1, template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.PIPE)) # All other ranks should have their output go to DEVNULL grp.add_process( nproc=num_ranks - 1, - template=TemplateProcess(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), + template=ProcessTemplate(target=exe, args=args, cwd=run_dir, stdout=Popen.DEVNULL), ) # start the process group grp.init() diff --git a/examples/workflows/parsl/README.md b/examples/workflows/parsl/README.md index 811ae30..663b290 100755 --- a/examples/workflows/parsl/README.md +++ b/examples/workflows/parsl/README.md @@ -3,11 +3,11 @@ ## Introduction This shows an example workflow using Parsl with Dragon. In this example we use a Dragon implementation of the `@mpi_app` decorator and the `DragonMPIExecutor`. The executor expects five arguments to be returned from the decorated function: the executable, the directory containing the executable, the policy for process placement, the number of MPI processes to launch, and the arguments to pass to the executable. The arguments are expected to be returned in this order. The executor returns a future thats result is a dictionary containing a connection to stdin and stdout to rank 0. -In this example we compute the factorial of the largest MPI rank. We multiply this factorial by a scale factor that is sent using the stdin connection and add a bias to the scaled factorial that is passed to the MPI app via the args. The result is printed out by rank 0 and received by the head process from the stdout connection. This result is printed out and compared to the expected exact solution. +In this example we compute the factorial of the largest MPI rank. We multiply this factorial by a scale factor that is sent using the stdin connection and add a bias to the scaled factorial that is passed to the MPI app via the args. The result is printed out by rank 0 and received by the head process from the stdout connection. This result is printed out and compared to the expected exact solution. ## Usage -`parsl_mpi_app_demo.py` - This is the main file. It contains the `@mpi_app` decorated function with the required return arguments for that function. It also has the two functions used for sending data to and receiving data from stdin and stdout, respectively. +`parsl_mpi_app_demo.py` - This is the main file. It contains the `@mpi_app` decorated function with the required return arguments for that function. It also has the two functions used for sending data to and receiving data from stdin and stdout, respectively. `factorial.c` - This contains what the MPI application that computes the factorial, scales it by the scale factor received from the stdin connection, and then adds the bias from the args to it. @@ -17,12 +17,12 @@ In this example we compute the factorial of the largest MPI rank. We multiply th usage: dragon parsl_mpi_app_demo.py ``` -## Installation +## Installation After installing dragon, the only other dependency is on Parsl. 
The command to install Parsl is ``` -> pip install parsl +> pip install parsl ``` ## Example Output @@ -35,5 +35,5 @@ gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/include -L /opt gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib factorial.o -o factorial -lm -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib -lmpich > salloc --nodes=2 --exclusive >$dragon dragon parsl_mpi_app_demo.py -mpi computation: 0.000100 * 362880.000000 + 10.000000 = 46.288000 , exact = 46.288000000000004 +mpi computation: 0.000100 * 362880.000000 + 10.000000 = 46.288000 , exact = 46.288000000000004 ``` diff --git a/external/Makefile b/external/Makefile index 35fe235..7241128 100644 --- a/external/Makefile +++ b/external/Makefile @@ -1,8 +1,40 @@ +ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) +export CXXFLAGS = -fPIC +export CPPFLAGS = -fPIC +#export LDFLAGS = -L$(ROOT_DIR)/../_env/lib CC = gcc CXX = g++ +configure: configure-cpython build: build-cpython -clean: clean-cpython +clean: clean-cpython clean-capnproto +# Don't add to main build target. This is targeted directly in src/Makefile. +build-capnproto: +ifeq ($(wildcard capnproto/c++/capnp), ) + git clone https://github.com/capnproto/capnproto.git + cd capnproto && git checkout master + cd capnproto/c++ && find . -type f -not -path '*/\.*' -exec sed -i 's/KJ_LOG(WARNING,/KJ_LOG(INFO,/g' {} + + cd capnproto/c++ && autoreconf -i && ./configure --prefix=$(ROOT_DIR)/../_env --disable-shared + cd capnproto/c++ && make && make install + cd capnproto/c++ && autoreconf -i && ./configure --prefix=$(ROOT_DIR)/../_env + git clone https://github.com/capnproto/pycapnp.git + @echo "capnproto/c++/capnp and pycapnp exist, now building." + cd capnproto/c++ && make && make install + cp -f $(ROOT_DIR)/../_env/lib/libcapnp.a $(ROOT_DIR)/../src/lib + cp -f $(ROOT_DIR)/../_env/lib/libkj.a $(ROOT_DIR)/../src/lib + ln -sf $(ROOT_DIR)/../_env/include/capnp $(ROOT_DIR)/../src/include/ + ln -sf $(ROOT_DIR)/../_env/include/kj $(ROOT_DIR)/../src/include/ + cd pycapnp && \ + PKG_CONFIG_PATH=$(ROOT_DIR)/../_env/lib/pkgconfig:$PKG_CONFIG_PATH \ + PATH=$(ROOT_DIR)/../_env/bin:${PATH} \ + LDFLAGS=-L$(ROOT_DIR)/../_env/lib CFLAGS=-I$(ROOT_DIR)/../_env/include \ + pip --verbose wheel . + cd pycapnp && pip install pycapnp*.whl +endif + +clean-capnproto: + rm -rf capnproto + rm -rf pycapnp configure-cpython: ifneq ($(wildcard cpython_master/Makefile), ) @@ -15,6 +47,7 @@ else endif endif + build-cpython: configure-cpython ifneq ($(DRAGON_CPYTHON_PATH), ) cd cpython_master && make -j && make install diff --git a/external/capnproto b/external/capnproto new file mode 160000 index 0000000..e7f22da --- /dev/null +++ b/external/capnproto @@ -0,0 +1 @@ +Subproject commit e7f22da9c01286a2b0e1e5fbdf3ec9ab3aa128ff diff --git a/hack/clean_build b/hack/clean_build index 0cc7c4c..297a9bf 100644 --- a/hack/clean_build +++ b/hack/clean_build @@ -9,14 +9,7 @@ module load dragon-dev cd src make distclean cd .. -pythonpath=`which python3` -echo $pythonpath -env_str='_env' -echo $env_str -if [[ "$pythonpath" == *"$env_str"* ]]; then - echo "Deactivating environment." - deactivate -fi +deactivate echo "Building and activating new, clean environment." python3 -m venv --clear _env . _env/bin/activate @@ -24,6 +17,10 @@ python3 -m pip install -U pip python3 -m pip install -r src/requirements.txt -c src/constraints.txt export PATH=$PWD/hack:$PATH echo "Building source code." 
+cd external +make clean +make build-capnproto +cd .. cd src make python3 setup.py develop diff --git a/src/Doxyfile b/src/Doxyfile index aa00688..fcd0df2 100644 --- a/src/Doxyfile +++ b/src/Doxyfile @@ -958,7 +958,9 @@ RECURSIVE = YES EXCLUDE = bin \ cpython \ modulefiles \ - pkg + pkg \ + _env \ + external # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -974,7 +976,8 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = */kj/* \ + */capnp/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the diff --git a/src/Makefile b/src/Makefile index 4c186da..3c15661 100644 --- a/src/Makefile +++ b/src/Makefile @@ -17,6 +17,7 @@ TOOLS := tools/dragon-cleanup \ BINARIES := $(addprefix bin/, $(notdir $(TOOLS))) +EXTERNAL := ../external/capnproto CY_FILES := dragon/pydragon_rc.pyx dragon/return_codes.pxd @@ -24,9 +25,13 @@ CY_FILES := dragon/pydragon_rc.pyx dragon/return_codes.pxd all: build .PHONY: build -build: $(LIBRARIES) $(BINARIES) $(CY_FILES) +build: $(EXTERNAL) $(CYFILES) $(LIBRARIES) $(BINARIES) $(PYTHON) setup.py build $(SETUP_PY_BUILD_FLAGS) +.PHONY: $(EXTERNAL) +$(EXTERNAL): + $(MAKE) -C ../external build-capnproto + .PHONY: $(LIBRARIES) $(LIBRARIES): %.so: $(MAKE) -C $(@D) $(@F) @@ -51,7 +56,6 @@ clean: $(MAKE) -C dragon/launcher/src clean $(MAKE) -C lib clean $(MAKE) -C include clean - $(MAKE) -C ../external clean # create a package for release .PHONY: dist @@ -60,7 +64,24 @@ dist: $(addprefix $(INSTALL_DIR)/,$(LIBRARIES) $(BINARIES)) $(CY_FILES) $(MAKE) -C ../external dist $(PYTHON) setup.py clean --all $(PYTHON) setup.py build $(SETUP_PY_BUILD_FLAGS) --cythonize -f - $(PYTHON) setup.py bdist_wheel --skip-build +# Prepare to files to be included in the distribution. + cp ../external/pycapnp/pycapnp*.whl dist + cp bin/dragon-install dist +# This code copies the message_defs.capnp file into the wheel file +# which is needed for the captain proto support. + rm -rf build/dragon_wheel + mkdir build/dragon_wheel/ + mkdir build/dragon_wheel/dragon + mkdir build/dragon_wheel/dragon/infrastructure + cp -f lib/message_defs.capnp build/dragon_wheel/dragon/infrastructure/ +# The following lines do not work to get the library files into the wheel file +# because RPATH is not set correctly in them. Code is left here for now because +# it does show how we might copy the files in should we solve the RPATH problem. +# mkdir build/dragon_wheel/lib +# cp -f lib/libdragon.so build/dragon_wheel/lib +# cp -f lib/libpmod.so build/dragon_wheel/lib +# cp -f lib/libpmsgqueue.so build/dragon_wheel/lib + $(PYTHON) setup.py bdist_wheel --skip-build --bdist-dir=$(abspath build/dragon_wheel) $(MAKE) -C modulefiles dist $(MAKE) -C pkg dist $(MAKE) -C ../test dist diff --git a/src/bin/dragon-install b/src/bin/dragon-install new file mode 100755 index 0000000..a16d8d5 --- /dev/null +++ b/src/bin/dragon-install @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# This is meant to be run as part of an install from release package. +# invoke as: ./dragon-install +# from the untarred directory. It will create a virtual environment +# where you execute the command. 
To create it someplace else, first
+# move the files in this gzipped archive where you want them and then
+# execute the ./dragon-install.
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+python3 -m venv --clear _env
+. _env/bin/activate
+cd $dirname
+pycapnpwheel=$(ls $SCRIPT_DIR/pycapnp*.whl)
+dragonwheel=$(ls $SCRIPT_DIR/dragon*.whl)
+echo "Pycapnp wheel is" $pycapnpwheel
+pip3 install --force-reinstall $pycapnpwheel
+pip3 install --force-reinstall $dragonwheel
+module use $SCRIPT_DIR/modulefiles
+module load dragon
diff --git a/src/dragon/__init__.py b/src/dragon/__init__.py
index 94b8fbf..845c84f 100644
--- a/src/dragon/__init__.py
+++ b/src/dragon/__init__.py
@@ -58,4 +58,4 @@ def _patch_torch():
 _patch_multiprocessing()

 if bool(strtobool(os.environ.get("DRAGON_PATCH_TORCH", "False"))):
-    _patch_torch()
+    _patch_torch()
\ No newline at end of file
diff --git a/src/dragon/channels.pxd b/src/dragon/channels.pxd
index eeb4b40..b82dd1c 100644
--- a/src/dragon/channels.pxd
+++ b/src/dragon/channels.pxd
@@ -19,3 +19,6 @@ cdef class Channel:
         derr = dragon_channel_get_pool(&self._channel, pool)
         if derr != DRAGON_SUCCESS:
             return (derr, "Could not retrieve memory pool from channel")
+
+        if not dragon_memory_pool_is_local(pool):
+            pool[0] = self._default_pool._pool_hdl
diff --git a/src/dragon/cli/__init__.py b/src/dragon/cli/__init__.py
index 0624309..bb7effb 100644
--- a/src/dragon/cli/__init__.py
+++ b/src/dragon/cli/__init__.py
@@ -20,6 +20,8 @@ def _from_text(text):
 PROCNAME_LS = 'dragon-localservices'
 PROCNAME_TCP_TA = 'dragon-tcp'
 PROCNAME_OVERLAY_TA = 'dragon-overlay-tcp'
+PROCNAME_OOB_TA = 'dragon-oob-tcp'
+PROCNAME_RDMA_TA = 'dragon-hsta'
 PROCNAME_NETWORK_CONFIG = 'dragon-network-config'
 PROCNAME_NETWORK_CONFIG_LAUNCH_HELPER = 'dragon-network-config-launch-helper'
 PROCNAME_NETWORK_CONFIG_SHUTDOWN_HELPER = 'dragon-network-config-shutdown-helper'
diff --git a/src/dragon/data/ddict/__init__.py b/src/dragon/data/ddict/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/dragon/data/ddict/ddict.py b/src/dragon/data/ddict/ddict.py
new file mode 100644
index 0000000..5268695
--- /dev/null
+++ b/src/dragon/data/ddict/ddict.py
@@ -0,0 +1,682 @@
+"""
+The Distributed Dictionary is a performant and distributed key-value store
+that is available to applications and workflows written for the Dragon ecosystem.
+
+This is Dragon's specialized implementation based on the Dragon file-like interface
+which relies on Dragon Channels. The Distributed Dictionary works like a standard
+Python dictionary except that the data that it holds may span multiple nodes and be
+larger than any one node can hold.
+
+The internals of the distributed dictionary rely on several processes, including a
+single orchestrator process and one or more manager processes. Each client attaches
+to managers on an as-needed basis. Clients discover managers by attaching to the
+serialized descriptor of a Distributed Dictionary. When using a Distributed Dictionary
+in Python, the dictionary will be automatically pickled/serialized and sent to new
+processes in the same way a Queue or other objects can be passed as parameters in
+multiprocessing.
+
+While the Distributed Dictionary does its best to evenly distribute data across all
+managers, a localized wrapper class can be used to direct key/value pairs to user
+chosen managers. See the Distributed Dictionary documentation for more details.
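+
+As a quick illustration, and assuming a program launched under the Dragon runtime
+(dragon my_script.py), a dictionary can be created and used much like a regular
+Python dictionary. The sizes and key names below are arbitrary:
+
+    from dragon.data.ddict.ddict import DDict
+
+    dd = DDict(managers_per_node=2, n_nodes=4, total_mem=4 * 1024**3)
+    dd["params"] = {"lr": 0.001}   # stored on whichever manager the key hashes to
+    print(dd["params"])            # fetched back from that manager
+    ser = dd.serialize()           # pass this (or the DDict itself) to other processes,
+                                   # which can then call DDict.attach(ser)
+    dd.destroy()                   # tear down the managers and the orchestrator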
+ +""" + +import sys +import logging +import traceback +import cloudpickle +import time +import socket +import builtins +import os + + +from ...utils import b64decode, b64encode, hash as dragon_hash, get_local_kv +from ...infrastructure import parameters +from ...infrastructure import messages as dmsg +from ...infrastructure import policy +from ...channels import Channel +from ...native.process import Popen +from ...dlogging.util import setup_BE_logging, DragonLoggingServices as dls +from ...dlogging.logger import DragonLoggingError + +from ... import fli +from ...rc import DragonError + +log = None +KEY_HINT = 1 +VALUE_HINT = 2 +builtin_types = frozenset(dir(builtins)) + +# This is a default timeout value that is used for send/receive operations. +# Longer timeouts can be specified if needed by passing in a timeout on the +# distributed dictionary creation. The timeout applies to all operations that +# could timeout in the distributed dictionary. Likely causes of timeouts are +# a manager being overfilled, but some care is taken that does not occur. +DDICT_DEFAULT_TIMEOUT = 10 + +# This is the default size of a distributed dictionary which would normally be +# overridden. +DDICT_MIN_SIZE = 3*1024**2 # 3 MB + +# This is the generic error that all other Distributed Dictionary specific +# errors inherit from. Other types of exceptions my be raised while using the +# Distributed Dictionary, but specific errors generated by this code +# are provided here. +class DDictError(DragonLoggingError): + def __str__(self): + return f"DDict Exception: {self.msg}\n*** Dragon C-level Traceback: ***\n{self.lib_msg}\n*** End C-level Traceback: ***\nDragon Error Code: {self.lib_err}" + + +# This will be raised when a Distributed Dictionary manager has filled to +# capacity. To rectify this you may need to increase the overall size of the +# dictionary and/or devise a better distributed hashing function. +class DDictManagerFull(DDictError): + pass + +# Timeout errors that occur may be either the generic TimeoutError or +# some exception that inherits from TimeoutError, including the +# DDictTimeoutError given below. If catching these errors in your program +# it is probably best to catch the generic TimeoutError so you catch +# all types of timeout errors. +class DDictTimeoutError(DDictError, TimeoutError): + pass + +class DDict: + # TODO: reorganize __iter__ to use DDictIterator + # class DDictIterator: + # def __init__(self, ddict): + # self._ddict = ddict + # self._manager_idx = 0 + # self._current_iter_id = -1 + + # def __next__(self): + # # send msg(self._current_iter_id) to manager[self._manager_idx] + # # recv resp for the next key + # # check hint -> if EOF or not + # # if EOF: advance to the next manager, send msg to the manager, store iter id, send msg to manager + # # if no more manager: raise StopIteration + # # else: return key + # pass + + def __init__(self, managers_per_node:int=1, n_nodes:int=1, total_mem:int=DDICT_MIN_SIZE, *, + working_set_size:int=1, wait_for_keys:bool=False, wait_for_writers:bool=False, + policy:policy.Policy=None, persist_freq:int=0, persist_base_name:str="", + timeout:float=DDICT_DEFAULT_TIMEOUT) -> None: + """Construct a Distributed Dictionary to be shared amongst distributed processes running + in the Dragon Runtime. The distributed dictionary creates the specified number of managers + and shards the data across all managers. 
The total memory of the dictionary is split + across all the managers, so you want to allocate more space than is required by perhaps + 30 percent, but that should be determined via some experimentation and depends on the + application being developed. See the Dragon documentation's section on the Distributed + Dictionary design for more details about creating and using a distributed dictionary. + + Args: + managers_per_node (int, optional): The number of managers on each + node. The total_mem is divided up amongst the managers. + Defaults to 1. + + n_nodes (int, optional): The number of nodes that will have managers + deployed on them. Defaults to 1. + + total_mem (int, optional): The total memory in bytes that will be + sharded across all managers. Defaults to DDICT_MIN_SIZE + but this is really a minimum size for a single manager + and should be specified by the user. + + working_set_size (int, optional): Not implemented yet. This sets the + size of the checkpoint, in memory, working set. This + determines how much state each manager will keep + internally. This is the number of different, simultaneous + checkpoints that may be active at any point in time. + Defaults to 1. + + wait_for_keys (bool, optional): Not implemented yet. Setting this to + true means that each manager will keep track of a set of + keys at each checkpoint level and clients advancing to a + new checkpoint level will block until the set of keys at + the oldest, retiring working set checkpoint are all + written. By specifying this all clients will remain in + sync with each other relative to the size of the working + set. Defaults to False. It is also possible to store + key/values that are not part of the checkpointing set of + key/values. Those keys are called persistent keys and + will not be affected by setting this argument to true. + + wait_for_writers (bool, optional): Not implemented yet. Setting this + to true means that each manager will wait for a set of + clients to have all advanced their checkpoint id beyond + the oldest checkpointing id before retiring a checkpoint + from the working set. Setting this to true will cause + clients that are advancing rapidly to block while others + catch up. Defaults to False. + + policy (policy.Policy, optional): A policy can be supplied for + starting the managers. Please read about policies in the + Process Group documentation. Managers are started via a + Process Group and placement of managers and other + characteristics can be controlled via a policy. Defaults + to None which applies a Round-Robin policy. + + persist_freq (int, optional): Not implemented yet. This is the + frequency that a checkpoint will be persisted to disk. + This is independent of the working set size and can be + any frequency desired. Defaults to 0 which means that no + persisting will be done. + + persist_base_name (str, optional): Not implemented yet. This is a + base file name to be applied to persisted state for the + dictionary. This base name along with a checkpoint number + is used to restore a distributed dictionary from a + persisted checkpoint. Defaults to "". + + timeout (float, optional): This is a timeout that will be used for + all timeouts on the creating client and all managers + during communication between the distributed components + of the dictionary. New clients wishing to set their own + timeout can use the attach method to specify their own + local timeout. Defaults to DDICT_DEFAULT_TIMEOUT. + + Raises: + AttributeError: If incorrect parameters are supplied. 
+ RuntimeError: If there was an unexpected error during initialization. + """ + + # This is the pattern used in the pydragon_perf.pyx file + # It works, but may need review if it's the way we want to do it + + # This block turns on client log for the initial client that creates the dictionary + global log + if log == None: + fname = f'{dls.DD}_{socket.gethostname()}_client_{str(parameters.this_process.my_puid)}.log' + setup_BE_logging(service=dls.DD, fname=fname) + log = logging.getLogger(str(dls.DD)) + + try: + # Start the Orchestrator and capture its serialized descriptor so we can connect to it. + if type(managers_per_node) is not int or type(n_nodes) is not int or type(total_mem) is not int: + raise AttributeError('When creating a Dragon Distributed Dict you must provide managers_per_node, n_nodes, and total_mem') + + proc = Popen(executable=sys.executable, args=['-c', + f'import dragon.data.ddict.orchestrator as orc; orc.start({managers_per_node}, {n_nodes}, {total_mem})'], + stdout=Popen.PIPE) + + # Read the serialized FLI of the orchestrator. + ddict = proc.stdout.recv().strip() + + self._orc_connector = fli.FLInterface.attach(b64decode(ddict)) + self._args = (working_set_size, wait_for_keys, wait_for_writers, policy, persist_freq, persist_base_name, timeout) + + self.__setstate__((ddict, timeout)) + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There is an exception initializing ddict: {ex}\n Traceback: {tb}\n') + raise RuntimeError(f'There is an exception initializing ddict: {ex}\n Traceback: {tb}\n') + + def __setstate__(self, args): + serialized_orc, timeout = args + + # This block turns on a client log for each client + # global log + # if log == None: + # fname = f'{dls.DD}_{socket.gethostname()}_client_{str(parameters.this_process.my_puid)}.log' + # setup_BE_logging(service=dls.DD, fname=fname) + # log = logging.getLogger(str(dls.DD)) + + self._managers = dict() + self._tag = 0 + self._destroyed = False + self._detached = False + self._timeout = timeout + + try: + return_channel = Channel.make_process_local() + buffered_return_channel = Channel.make_process_local() + + self._default_pool = return_channel.get_pool() + + self._return_connector = fli.FLInterface(main_ch=return_channel) + self._serialized_return_connector = b64encode(self._return_connector.serialize()) + + self._buffered_return_connector = fli.FLInterface(main_ch=buffered_return_channel, use_buffered_protocol=True) + self._serialized_buffered_return_connector = b64encode(self._buffered_return_connector.serialize()) + + self._serialized_orc = serialized_orc + try: + self._create(b64encode(cloudpickle.dumps((self._args)))) + except AttributeError: + self._orc_connector = fli.FLInterface.attach(b64decode(serialized_orc)) + + self._client_id = None + + self._get_main_manager() + self._register_client_to_main_manager() + except Exception as ex: + tb = traceback.format_exc() + try: + log.debug(f'There is an exception __setstate__ of ddict: {ex}\n Traceback: {tb}\n') + except: + pass + raise RuntimeError(f'There is an exception __setstate__ of ddict: {ex}\n Traceback: {tb}\n') + + def __getstate__(self): + return (self.serialize(), self._timeout) + + def __del__(self): + try: + self.detach() + except Exception as ex: + try: + tb = traceback.format_exc() + log.debug(f'There was an exception while terminating the Distributed Dictionary. 
Exception is {ex}\n Traceback: {tb}\n') + except: + pass + + def _create(self, pickled_args): + msg = dmsg.DDCreate(self._tag_inc(), respFLI=self._serialized_buffered_return_connector, args=pickled_args) + resp_msg = self._send_receive([(msg, None)], connection=self._orc_connector) + if resp_msg.err != DragonError.SUCCESS: + raise RuntimeError('Failed to create dictionary!') + + def _get_main_manager(self): # SHGetKV + try: + serialized_main_manager = get_local_kv(key=self._serialized_orc) + self._main_manager_connection = fli.FLInterface.attach(b64decode(serialized_main_manager)) + except KeyError as e: + # no manager on the node, get a random manager from orchestrator + try: + log.info(f'Got KeyError {e} during bringup, sending get random manager request to orchestrator') + except: + pass + msg = dmsg.DDGetRandomManager(self._tag_inc(), respFLI=self._serialized_buffered_return_connector) + resp_msg = self._send_receive([(msg, None)], connection=self._orc_connector) + if resp_msg.err != DragonError.SUCCESS: + raise RuntimeError('Client failed to get manager from orchestrator') + self._main_manager_connection = fli.FLInterface.attach(b64decode(resp_msg.manager)) + + def _register_client_to_main_manager(self): # client ID assigned here + msg = dmsg.DDRegisterClient(self._tag_inc(), respFLI=self._serialized_return_connector, bufferedRespFLI=self._serialized_buffered_return_connector) # register client to the manager (same node) + resp_msg = self._send_receive([(msg, None)], connection=self._main_manager_connection) + if resp_msg.err != DragonError.SUCCESS: + raise RuntimeError('Client failed to connect to main manager.') + self._client_id = resp_msg.clientID + self._num_managers = resp_msg.numManagers + + def _connect_to_manager(self, manager_id): + msg = dmsg.DDConnectToManager(self._tag_inc(), clientID=self._client_id, managerID=manager_id) + resp_msg = self._send_receive([(msg, None)], connection=self._main_manager_connection) + if resp_msg.err != DragonError.SUCCESS: + raise RuntimeError(f'Client {self._client_id} failed to coonect to manager {manager_id}') + + self._managers[manager_id] = fli.FLInterface.attach(b64decode(resp_msg.manager)) + + def _register_client_ID_to_manager(self, manager_id): + try: + msg = dmsg.DDRegisterClientID(self._tag_inc(), clientID=self._client_id, respFLI=self._serialized_return_connector, bufferedRespFLI=self._serialized_buffered_return_connector) + resp_msg = self._send_receive([(msg, None)], connection=self._managers[manager_id]) + if resp_msg.err != DragonError.SUCCESS: + raise Exception(f'Failed to register client {self._client_id} to manager {manager_id}') + except Exception as ex: + tb = traceback.format_exc() + try: + log.debug(f'There was an exception registering client ID {self._client_id=} with manager: {ex} \n Traceback: {tb}') + except: + pass + raise RuntimeError(f'There was an exception registering client ID {self._client_id=} with manager: {ex} \n Traceback: {tb}') + + def _check_manager_connection(self, manager_id=None, all=False): + if all: + for manager_id in range(self._num_managers): + if manager_id not in self._managers: + self._connect_to_manager(manager_id) + self._register_client_ID_to_manager(manager_id) + elif not (manager_id in self._managers): + self._connect_to_manager(manager_id) + self._register_client_ID_to_manager(manager_id) + + def _choose_manager_pickle_key(self, key): + # Check to see if there is a user-defined hash function. If so, then + # assume it is deterministic and the same across all nodes and use it. 
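+        # Otherwise fall back to hashing the pickled key bytes, so every client maps a
+        # given key to the same manager no matter where the client runs.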
+ pickled_key = cloudpickle.dumps(key) + + try: + if key.__hash__.__class__.__name__ == 'method': + return (hash(key) % self._num_managers, pickled_key) + except: + pass + + hash_val = dragon_hash(pickled_key) + return (hash_val % self._num_managers, pickled_key) + + + def _tag_inc(self): + tag = self._tag + self._tag += 1 + return tag + + def _send(self, msglist, connection): + with connection.sendh(timeout=self._timeout) as sendh: + for (msg, arg) in msglist: + if arg is None: + sendh.send_bytes(msg.serialize(), timeout=self._timeout) + else: + # It is a pickled value so don't call serialize. + sendh.send_bytes(msg, arg=arg, timeout=self._timeout) + + def _recv_resp(self): + with self._buffered_return_connector.recvh(timeout=self._timeout) as recvh: + resp_ser_msg, hint = recvh.recv_bytes(timeout=self._timeout) + return dmsg.parse(resp_ser_msg) + + def _recv_dmsg_and_val(self, key): + with self._return_connector.recvh(use_main_as_stream_channel=True, timeout=self._timeout) as recvh: + (resp_ser_msg, hint) = recvh.recv_bytes(timeout=self._timeout) + resp_msg = dmsg.parse(resp_ser_msg) + if resp_msg.err != DragonError.SUCCESS: + raise KeyError(key) + try: + value = cloudpickle.load(file=PickleReadAdapter(recvh=recvh, hint=VALUE_HINT, timeout=self._timeout)) + except Exception as e: + tb = traceback.format_exc() + try: + log.info(f'Exception caught in cloudpickle load: {e} \n Traceback: {tb}') + except: + pass + raise RuntimeError(f'Exception caught in cloudpickle load: {e} \n Traceback: {tb}') + + return value + + def _send_receive(self, msglist, connection): + try: + self._send(msglist, connection) + resp_msg = self._recv_resp() + return resp_msg + + except Exception as ex: + tb = traceback.format_exc() + try: + log.debug(f'There was an exception in the _send_receive in ddict: {ex} \n Traceback: {tb}') + except: + pass + raise RuntimeError(f'There was an exception in the _send_receive in ddict: {ex} \n Traceback: {tb}') + + def destroy(self): + if self._destroyed: + return + + self._destroyed = True + try: + msg = dmsg.DDDestroy(self._tag_inc(), self._client_id, respFLI=self._serialized_buffered_return_connector) + resp_msg = self._send_receive([(msg, None)], connection=self._orc_connector) + except Exception as ex: + tb = traceback.format_exc() + try: + log.debug(f'There was an exception in the destroy: {ex} \n Traceback: {tb}') + except: + pass + + try: + self._orc_connector.detach() + except Exception as ex: + try: + tb = traceback.format_exc() + log.debug(f'There was an exception while detaching orchestrator channel: {ex} \n Traceback: {tb}') + except: + pass + + def serialize(self): + return self._serialized_orc + + @classmethod + def attach(cls, serialized_dict, timeout=0): + new_client = cls.__new__(cls) + new_client.__setstate__((serialized_dict, timeout)) + return new_client + + def __setitem__(self, key, value): + msg = dmsg.DDPut(self._tag_inc(), self._client_id) + manager_id, pickled_key = self._choose_manager_pickle_key(key) + self._check_manager_connection(manager_id) + try: + with self._managers[manager_id].sendh(timeout=self._timeout) as sendh: + sendh.send_bytes(msg.serialize(), timeout=self._timeout) + sendh.send_bytes(pickled_key, arg=KEY_HINT, timeout=self._timeout) + cloudpickle.dump(value, file=PickleWriteAdapter(sendh=sendh, hint=VALUE_HINT, timeout=self._timeout)) + + except TimeoutError as ex: + raise DDictTimeoutError(DragonError.TIMEOUT, f'The operation timed out. 
This could be a network failure or an out of memory condition.\n{str(ex)}') + + try: + resp_msg = self._recv_resp() + except TimeoutError as ex: + raise DDictTimeoutError(DragonError.TIMEOUT, f'The operation timed out. This could be a network failure or an out of memory condition.\n{str(ex)}') + + if resp_msg.err == DragonError.MEMORY_POOL_FULL: + raise DDictManagerFull(DragonError.MEMORY_POOL_FULL, f"Distributed Dictionary Manager {manager_id} is full. The key/value pair was not stored.", ) + + if resp_msg.err != DragonError.SUCCESS: + raise DDictError(resp_msg.err, 'Failed to store key in the distributed dictionary.') + + def __getitem__(self, key): + msg = dmsg.DDGet(self._tag_inc(), self._client_id) + manager_id, pickled_key = self._choose_manager_pickle_key(key) + self._check_manager_connection(manager_id) + self._send([(msg, None), + (pickled_key, KEY_HINT)], self._managers[manager_id]) + + value = self._recv_dmsg_and_val(key) + return value + + def keys(self): + + keys = [] + self._check_manager_connection(all=True) + for manager_id in range(self._num_managers): + msg = dmsg.DDKeys(self._tag_inc(), self._client_id) + self._send([(msg, None)], self._managers[manager_id]) + with self._return_connector.recvh(use_main_as_stream_channel=True, timeout=self._timeout) as recvh: + resp_ser_msg, _ = recvh.recv_bytes(timeout=self._timeout) + resp_msg = dmsg.parse(resp_ser_msg) + if resp_msg.err != DragonError.SUCCESS: + raise RuntimeError(f'{resp_msg.err}') + done = False + while not done: + try: + key = cloudpickle.load(file=PickleReadAdapter(recvh=recvh, hint=KEY_HINT, timeout=self._timeout)) + keys.append(key) + except EOFError: + done = True + break + return keys + + def values(self): + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def items(self): + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def pop(self, key): + msg = dmsg.DDPop(self._tag_inc(), self._client_id) + manager_id, pickled_key = self._choose_manager_pickle_key(key) + self._check_manager_connection(manager_id) + self._send([(msg, None), + (pickled_key, KEY_HINT)], self._managers[manager_id]) + return self._recv_dmsg_and_val(key) + + def clear(self): + + self._check_manager_connection(all=True) + + for manager_id in range(self._num_managers): + msg = dmsg.DDClear(self._tag_inc(), self._client_id) + self._send([(msg, None)], self._managers[manager_id]) + + for _ in range(self._num_managers): + with self._buffered_return_connector.recvh(timeout=self._timeout) as recvh: + (resp_ser_msg, hint) = recvh.recv_bytes(timeout=self._timeout) + resp_msg = dmsg.parse(resp_ser_msg) + if resp_msg.err != DragonError.SUCCESS: + raise RuntimeError(resp_msg.err) + + def update(self, dict2): + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def popitem(self): + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def copy(self): + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def __contains__(self, key): + msg = dmsg.DDContains(self._tag_inc(), self._client_id) + manager_id, pickled_key = self._choose_manager_pickle_key(key) + self._check_manager_connection(manager_id) + resp_msg = self._send_receive([(msg, None), (pickled_key, KEY_HINT)], + connection=self._managers[manager_id]) + + if resp_msg.err == DragonError.SUCCESS: + return True + + if resp_msg.err == DragonError.KEY_NOT_FOUND: + return False + + raise RuntimeError(resp_msg.err) + + def __len__(self): + 
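+        """Return the total number of keys in the dictionary by asking every manager
+        for its local key count and summing the results."""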
self._check_manager_connection(all=True) + + for manager_id in range(self._num_managers): + msg = dmsg.DDGetLength(self._tag_inc(), self._client_id) + sendh = self._send([(msg, None)], self._managers[manager_id]) + + length = 0 + for _ in range(self._num_managers): + resp_msg = self._recv_resp() + if resp_msg.err == DragonError.SUCCESS: + length += resp_msg.length + else: + raise RuntimeError(resp_msg.err) + return length + + def __delitem__(self, key): + self.pop(key) + + # Not yet implemented. + # def __iter__(self): + # """ + # not safe to iterate over dictionary while other clients are modifying + # """ + # try: + # self._check_manager_connection(all=True) + + # for manager_id in range(self._num_managers): + # msg = dmsg.DDGetIterator(self._tag_inc(), self._client_id) + + # resp_msg = self._send_receive([(msg, None)], connection=self._managers[manager_id]) + # if resp_msg.err != DragonError.SUCCESS: + # raise RuntimeError('Fail to get iterator from manager') + # iter_id = resp_msg.iterID + # done = False + # while not done: + # msg = dmsg.DDIteratorNext(tag=self._tag_inc(), clientID=self._client_id, iterID=iter_id) + # self._send([(msg, None)], connection=self._managers[manager_id]) + # with self._return_connector.recvh(use_main_as_stream_channel=True, timeout=self._timeout) as recvh: + # resp_ser_msg, hint = recvh.recv_bytes(timeout=self._timeout) + # resp_msg = dmsg.parse(resp_ser_msg) + # if resp_msg.err == DragonError.NO_MORE_KEYS: + # done = True + # elif resp_msg.err != DragonError.SUCCESS: + # raise RuntimeError('Unable to iterate the next key.') + # else: + # try: + # key = cloudpickle.load(file=PickleReadAdapter(recvh=recvh, hint=KEY_HINT, timeout=self._timeout)) + # yield key + # except Exception as e: + # tb = traceback.format_exc() + # raise e + # except Exception as e: + # tb = traceback.format_exc() + # log.debug(f'Got exception in client iter: {e}\n Traceback: {tb}') + # raise RuntimeError(f'Got exception in client iter: {e}\n Traceback: {tb}') + + def __hash__(self): + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def __equal__(self): + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def __str__(self): # return a iterator for an object + raise NotImplementedError('Not implemented on Dragon Distributed Dictionaries.') + + def dump_state(self): + pass + + def detach(self): + try: + if self._destroyed or self._detached: + try: + log.debug(f'Cannot detach client {self._client_id} from a destroyed/detached dictionary.') + except: + pass + return + + self._detached = True + + for manager_id in self._managers: + try: + msg = dmsg.DDDeregisterClient(self._tag_inc(), clientID=self._client_id, respFLI=self._serialized_buffered_return_connector) + resp_msg = self._send_receive([(msg, None)], connection=self._managers[manager_id]) + if resp_msg.err != DragonError.SUCCESS: + log.debug(f'Error on response to deregister client {self._client_id}') + + self._managers[manager_id].detach() + except: + pass + + except Exception as ex: + try: + tb = traceback.format_exc() + log.debug(f'There was an exception while detaching the client {self._client_id}. 
Exception: {ex}\n Traceback: {tb}')
+            except:
+                pass
+
+class PickleWriteAdapter:
+
+    def __init__(self, sendh, timeout=None, hint=None):
+        self._sendh = sendh
+        self._timeout = timeout
+        self._hint = hint
+
+    def write(self, b):
+        try:
+            self._sendh.send_bytes(b, timeout=self._timeout, arg=self._hint)
+        except Exception as ex:
+            tb = traceback.format_exc()
+            try:
+                log.debug(f'Caught exception in pickle write: {ex}\n {tb}')
+            except:
+                pass
+
+class PickleReadAdapter:
+
+    def __init__(self, recvh, timeout=None, hint=None):
+        self._recvh = recvh
+        self._timeout = timeout
+        self._hint = hint
+
+    def read(self, size=-1):
+        try:
+            data, arg = self._recvh.recv_bytes(size=size, timeout=self._timeout)
+            assert arg == self._hint
+            return data
+        except EOFError:
+            return b''
+        except Exception as ex:
+            tb = traceback.format_exc()
+            try:
+                log.debug(f'Caught exception in pickle read: {ex}\n {tb}')
+            except:
+                pass
+
+    def readline(self):
+        return self.read()
diff --git a/src/dragon/data/ddict/manager.py b/src/dragon/data/ddict/manager.py
new file mode 100644
index 0000000..ade0471
--- /dev/null
+++ b/src/dragon/data/ddict/manager.py
@@ -0,0 +1,523 @@
+"""
+The Distributed Dictionary is a performant and distributed key-value store
+that is available to applications and workflows written for the Dragon ecosystem.
+
+This is Dragon's specialized implementation based on the Dragon file-like interface
+which relies on Dragon Channels. The Distributed Dictionary works like a standard
+Python dictionary except that the data that it holds may span multiple nodes and be
+larger than any one node can hold.
+
+Each manager manages one shard of the distributed dictionary. The manager for a
+key/value pair is selected based on the hashed value of the key used when inserting
+into the distributed dictionary.
+
+"""
+
+import os
+import sys
+import logging
+import traceback
+import time
+import socket
+import os
+
+from ...utils import b64decode, b64encode, set_local_kv
+from ... import managed_memory as dmem
+from ...globalservices import channel
+from ...globalservices import pool
+from ...infrastructure import facts
+from ...infrastructure import parameters
+from ...infrastructure import util as dutil
+from ...infrastructure import messages as dmsg
+from ...infrastructure.channel_desc import ChannelOptions
+from ...localservices.options import ChannelOptions as LSChannelOptions
+from ...channels import Channel
+from ... import fli
+from ...rc import DragonError
+from .ddict import KEY_HINT, VALUE_HINT, DDictManagerFull
+from ...dlogging.util import setup_BE_logging, DragonLoggingServices as dls
+
+log = None
+NUM_STREAM_CHANNELS = 20
+MAX_NUM_CLIENTS = 100000
+
+# This reserve makes it possible to keep using the managed memory pool even when it
+# is nearly full: additional puts are rejected so the manager can still respond to
+# requests.
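+# RESERVED_POOL_SPACE is a 1 MiB safety margin: put() refuses new key/value pairs once
+# the pool's free space drops below it (or the pool's utilization reaches 90 percent).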
+RESERVED_POOL_SPACE = 1024**2 + +class Manager: + + _DTBL = {} # dispatch router, keyed by type of message + + def __init__(self, pool_size: int, serialized_return_orc, serialized_main_orc, args): + self._working_set_size, self._wait_for_keys, self._wait_for_writers, self._policy, self._persist_freq, self._persist_base_name, self._timeout = args + self._puid = parameters.this_process.my_puid + fname = f'{dls.DD}_{socket.gethostname()}_manager_{str(self._puid)}.log' + global log + if log == None: + setup_BE_logging(service=dls.DD, fname=fname) + log = logging.getLogger(str(dls.DD)) + + # create manager's return_connector (SHReturn) + self._return_channel = Channel.make_process_local() + self._return_connector = fli.FLInterface(main_ch=self._return_channel, use_buffered_protocol=True) + self._serialized_return_connector = b64encode(self._return_connector.serialize()) + + # create memory pool + _user = os.environ.get('USER', str(os.getuid())) + self._pool_name = f'{facts.DEFAULT_DICT_POOL_NAME_BASE}{os.getpid()}_{_user}' + self._pool_desc = pool.create(pool_size, user_name=self._pool_name) + self._pool = dmem.MemoryPool.attach(self._pool_desc._sdesc) + + # We create these two channels in the default pool to isolate them + # from the manager pool which could fill up. This is to make the + # receiving of messages a bit more resistant to the manager pool + # filling up. + self._fli_main_channel = Channel.make_process_local() + self._fli_manager_channel = Channel.make_process_local() + + # stream channels are created in the pool so when data arrives + # at the manager it is already copied into the manager's pool. + self._stream_channels_descs = [] + self._stream_channels = [] + + sh_channel_options = LSChannelOptions(capacity=10) + gs_channel_options = ChannelOptions(ref_count=True, local_opts=sh_channel_options) + + for i in range(NUM_STREAM_CHANNELS): + desc = channel.create(self._pool.muid, options=gs_channel_options) + self._stream_channels_descs.append(desc) + stream_channel = Channel.attach(desc.sdesc) + self._stream_channels.append(stream_channel) + + self._main_connector = fli.FLInterface(main_ch=self._fli_main_channel, manager_ch=self._fli_manager_channel, + pool=self._pool, stream_channels=self._stream_channels) + self._serialized_main_connector = b64encode(self._main_connector.serialize()) + + self._client_connections_map = {} + self._buffered_client_connections_map = {} + self._kvs = {} + self._key_map = {} + self.iterators = {} + self._serving = False + self._tag = 0 + self._iter_id = 0 + self._abnormal_termination = False + self._managers = [] + self._local_client_id = 0 + + # send the manager information to local service and orchestrator to register manager + self._serialized_main_orc = serialized_main_orc + self._register_with_local_service() + self._serialized_return_orc = serialized_return_orc + self._register_with_orchestrator(serialized_return_orc) + + def _free_resources(self): + + try: + # destroy all client maps + for i in self._buffered_client_connections_map: + self._buffered_client_connections_map[i].detach() + del self._buffered_client_connections_map + for i in self._client_connections_map: + self._client_connections_map[i].detach() + del self._client_connections_map + + self._main_connector.destroy() + self._fli_main_channel.destroy() + self._fli_manager_channel.destroy() + + for i in range(NUM_STREAM_CHANNELS): + channel.destroy(self._stream_channels_descs[i].c_uid) + pool.destroy(self._pool_desc.m_uid) + + except Exception as ex: + tb = traceback.format_exc() + 
log.debug(f'manager {self._puid} failed to destroy resources. Exception: {ex}\n Traceback: {tb}\n') + + def _register_with_local_service(self): + log.debug(f'manager is sending set_local_kv with {self._serialized_main_orc=}') + set_local_kv(key=self._serialized_main_orc, value=self._serialized_main_connector) + + def _tag_inc(self): + tag = self._tag + self._tag += 1 + return tag + + def _iter_inc(self): + iter_id = self._iter_id + self._iter_id += 1 + return iter_id + + def _send_msg(self, msg, connection, clientID=None): + try: + with connection.sendh(timeout=self._timeout) as sendh: + sendh.send_bytes(msg.serialize(), timeout=self._timeout) + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in the manager _send_msg: {ex} \n Traceback: {tb}') + raise RuntimeError(f'There was an exception in the manager _send_msg: {ex} \n Traceback: {tb}') + + def _recv_msg(self): + with self._return_connector.recvh() as recvh: + # Wait patiently here since it might be a big dictionary and is + # just starting. + resp_ser_msg, hint = recvh.recv_bytes() + + return dmsg.parse(resp_ser_msg) + + def _send_dmsg_and_value(self, resp_msg, connection, key_mem, transfer_ownership=False, clientID=None): + with connection.sendh(use_main_as_stream_channel=True, timeout=self._timeout) as sendh: + sendh.send_bytes(resp_msg.serialize(), timeout=self._timeout) + + if resp_msg.err == DragonError.SUCCESS: + val_list = self._kvs[key_mem] + if transfer_ownership: + self._kvs[key_mem] = [] + for val in val_list: + sendh.send_mem(val, transfer_ownership=transfer_ownership, arg=VALUE_HINT, timeout=self._timeout) + if transfer_ownership: + del self._kvs[key_mem] + del self._key_map[key_mem] + key_mem.free() + + def _register_with_orchestrator(self, serialized_return_orc): + msg = dmsg.DDRegisterManager(self._tag_inc(), self._serialized_main_connector, self._serialized_return_connector) + connection = fli.FLInterface.attach(b64decode(serialized_return_orc)) + self._send_msg(msg, connection) + connection.detach() + resp_msg = self._recv_msg() + if resp_msg.err != DragonError.SUCCESS: + raise Exception(f'Failed to register manager with orchester. Return code: {resp_msg.err}') + self._manager_id = resp_msg.managerID + self._managers = resp_msg.managers + self._client_id_offset = self._manager_id * MAX_NUM_CLIENTS + self._global_client_id = self._client_id_offset + self._serving = True + + def _move_to_pool(self, client_mem): + if client_mem.pool.muid != self._pool.muid: + # we need to move it - if no room don't wait. 
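+            # copy() with timeout=0 fails immediately when the manager pool is out of room;
+            # that failure is surfaced to the client as DDictManagerFull below.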
+ try: + new_mem = client_mem.copy(self._pool, timeout=0) + except dmem.DragonMemoryError: + raise DDictManagerFull(DragonError.MEMORY_POOL_FULL, "Could not move data to manager pool.") + finally: + client_mem.free() + + return new_mem + + return client_mem + + def _contains_put_key(self, client_key_mem): + if client_key_mem in self._kvs: + key_mem = self._key_map[client_key_mem] + client_key_mem.free() + ec = DragonError.SUCCESS + else: + key_mem = self._move_to_pool(client_key_mem) + self._key_map[key_mem] = key_mem + ec = DragonError.KEY_NOT_FOUND + + return ec, key_mem + + def _contains_and_free_msg_key(self, msg_key_mem): + if msg_key_mem in self._kvs: + ec = DragonError.SUCCESS + key_mem = self._key_map[msg_key_mem] + else: + key_mem = None + ec = DragonError.KEY_NOT_FOUND + + # We must always free the messages key memory + msg_key_mem.free() + + return ec, key_mem + + def _register_client(self, client_id, respFLI, bufferedRespFLI): + self._client_connections_map[client_id] = fli.FLInterface.attach(b64decode(respFLI)) + self._buffered_client_connections_map[client_id] = fli.FLInterface.attach(b64decode(bufferedRespFLI)) + + def _get_next_client_id(self): + client_id = self._global_client_id + self._local_client_id = (self._local_client_id + 1) % MAX_NUM_CLIENTS + self._global_client_id = self._client_id_offset + self._local_client_id + return client_id + + def dump_state(self): + print(f'\nmanager {self._puid} has {len(self._kvs)} keys stored with it.\n', file=sys.stderr, flush=True) + + def run(self): + try: + while self._serving: + with self._main_connector.recvh() as recvh: + ser_msg, hint = recvh.recv_bytes() + msg = dmsg.parse(ser_msg) + if type(msg) in self._DTBL: + self._DTBL[type(msg)][0](self, msg=msg, recvh=recvh) + else: + self._serving = False + self._abnormal_termination = True + log.debug(f'The message {msg} is not a valid message!') + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in manager:\n{ex}\n Traceback:\n{tb}') + + log.info(f'Manager {self._manager_id} preparing to exit') + log.info(f'Pool utilization percent is {self._pool.utilization}') + log.info(f'Number of keys stored is {len(self._kvs)}') + log.info(f'Free space is {self._pool.free_space}') + log.info(f'The total size of the pool managed by this manager was {self._pool.size}') + + try: + self._free_resources() + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in the manager while free resources:\n{ex}\n Traeback:\n{tb}') + + + @dutil.route(dmsg.DDRegisterClient, _DTBL) + def register_client(self, msg: dmsg.DDRegisterClient, recvh): + try: + client_id = self._get_next_client_id() + self._register_client(client_id=client_id, respFLI=msg.respFLI, bufferedRespFLI=msg.bufferedRespFLI) + resp_msg = dmsg.DDRegisterClientResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS, clientID=client_id, numManagers=len(self._managers)) + self._send_msg(resp_msg, self._buffered_client_connections_map[client_id], client_id) + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in the register_client to manager {self._puid=} for client {self._global_client_id}: {ex}\n Traceback: {tb}\n {msg.respFLI=}') + + @dutil.route(dmsg.DDConnectToManager, _DTBL) + def connect_to_manager(self, msg: dmsg.DDConnectToManager, recvh): + try: + resp_msg = dmsg.DDConnectToManagerResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS, manager=self._managers[msg.managerID]) + self._send_msg(resp_msg, 
self._buffered_client_connections_map[msg.clientID], msg.clientID) + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in request manager {msg.managerID} from manager {self._puid=} for client {msg.clientID}: {ex} \n Traceback: {tb}\n {msg.respFLI=}') + raise RuntimeError(f'There was an exception in request manager {msg.managerID} from manager {self._puid=} for client {msg.clientID}: {ex} \n Traceback: {tb}\n {msg.respFLI=}') + + @dutil.route(dmsg.DDRegisterClientID, _DTBL) + def register_clientID(self, msg: dmsg.DDRegisterClientID, recvh): + try: + self._register_client(client_id=msg.clientID, respFLI=msg.respFLI, bufferedRespFLI=msg.bufferedRespFLI) + resp_msg = dmsg.DDRegisterClientIDResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS) + self._send_msg(resp_msg, self._buffered_client_connections_map[msg.clientID], msg.clientID) + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in the register_clientID to manager {self._puid=} for client {msg.clientID}: {ex} \n Traceback: {tb}\n {msg.respFLI=}') + raise RuntimeError(f'There was an exception in the register_clientID to manager {self._puid=} for client {msg.clientID}: {ex} \n Traceback: {tb}\n {msg.respFLI=}') + + @dutil.route(dmsg.DDDestroyManager, _DTBL) + def destroy_manager(self, msg: dmsg.DDDestroyManager, recvh): + try: + self._serving = False + set_local_kv(key=self._serialized_main_orc, value='') + resp_msg = dmsg.DDDestroyManagerResponse(self._tag_inc(), ref=msg._tag, err=DragonError.SUCCESS) + connection = fli.FLInterface.attach(b64decode(self._serialized_return_orc)) + self._send_msg(resp_msg, connection) + connection.detach() + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception while destroying manager {self._puid=}: {ex} \n Traceback: {tb}\n') + + @dutil.route(dmsg.DDPut, _DTBL) + def put(self, msg: dmsg.DDPut, recvh): + ec = DragonError.MEMORY_POOL_FULL # It will be reset below. + key_mem = None + client_key_mem = None + val_list = [] + + try: + try: + if self._pool.utilization >= 90.0 or self._pool.free_space < RESERVED_POOL_SPACE: + raise DDictManagerFull(DragonError.MEMORY_POOL_FULL, f'DDict Manager {self._manager_id}: Pool reserve limit exceeded.') + + # There is likely room for the key/value pair. + client_key_mem, hint = recvh.recv_mem(timeout=self._timeout) + + assert hint == KEY_HINT + + try: + while True: + val_mem, hint = recvh.recv_mem(timeout=self._timeout) + val_mem = self._move_to_pool(val_mem) + assert hint == VALUE_HINT + val_list.append(val_mem) + except EOFError: + pass + + key_moved = True # it is moved on the next call. + ec, key_mem = self._contains_put_key(client_key_mem) + client_key_mem = None # reset because it may be freed. + # the underlying memory in the pool needs to be cleaned up if we put the same key-value pair into the dictionary + + if ec == DragonError.SUCCESS: + old_vals = self._kvs[key_mem] # free old value memory + self._kvs[key_mem] = [] # just in case of error while freeing + while len(old_vals) > 0: + try: + val = old_vals.pop() + val.free() + except Exception as ex: + log.info(f'There was an error while freeing value being replaced. {ex}') + + self._kvs[key_mem] = val_list + ec = DragonError.SUCCESS + + except Exception as ex: + log.info(f'Manager {self._manager_id} with PUID={self._puid} could not process put request. {ex}') + # Depending on where we got to, these two free's may fail, that's OK. 
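+            # Best-effort cleanup: free whatever key/value memory was already allocated and
+            # drain any unread stream data so the receive handle is left in a clean state.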
+ try: + key_mem.free() + except: + pass + try: + client_key_mem.free() + except: + pass + + while len(val_list) > 0: + val = val_list.pop() + try: + val.free() + except: + pass + + if not recvh.stream_received: + try: + while True: + mem, hint = recvh.recv_mem(timeout=self._timeout) + mem.free() + except EOFError: + pass + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'Caught exception while discarding rest of stream: {ex}\n {tb}') + + ec = DragonError.MEMORY_POOL_FULL + + resp_msg = dmsg.DDPutResponse(self._tag_inc(), ref=msg.tag, err=ec) + + self._send_msg(resp_msg, self._buffered_client_connections_map[msg.clientID], msg.clientID) + + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an unexpected exception in put in the manager, {self._puid=}, {msg.clientID=}: {ex} \n Traceback: {tb}') + raise RuntimeError(f'There was an unexpected exception in put in manager, {self._puid=}, {msg.clientID=}: {ex} \n Traceback: {tb}') + + @dutil.route(dmsg.DDGet, _DTBL) + def get(self, msg: dmsg.DDGet, recvh): + key_mem, hint = recvh.recv_mem(timeout=self._timeout) + assert hint == KEY_HINT + # the underlying memory in the pool needs to be cleanup + ec, key_mem = self._contains_and_free_msg_key(key_mem) + resp_msg = dmsg.DDGetResponse(self._tag_inc(), ref=msg.tag, err=ec) + self._send_dmsg_and_value(resp_msg, self._client_connections_map[msg.clientID], key_mem=key_mem, transfer_ownership=False, clientID=msg.clientID) + + @dutil.route(dmsg.DDPop, _DTBL) + def pop(self, msg: dmsg.DDPop, recvh): + key_mem, hint = recvh.recv_mem(timeout=self._timeout) + assert hint == KEY_HINT + ec, key_mem = self._contains_and_free_msg_key(key_mem) + resp_msg = dmsg.DDPopResponse(self._tag_inc(), ref=msg.tag, err=ec) + self._send_dmsg_and_value(resp_msg, self._client_connections_map[msg.clientID], key_mem=key_mem, transfer_ownership=True) + + @dutil.route(dmsg.DDContains, _DTBL) + def contains(self, msg: dmsg.DDContains, recvh): + key_mem, hint = recvh.recv_mem(timeout=self._timeout) + assert hint == KEY_HINT + ec, key_mem = self._contains_and_free_msg_key(key_mem) + resp_msg = dmsg.DDContainsResponse(self._tag_inc(), ref=msg.tag, err=ec) + self._send_msg(resp_msg, self._buffered_client_connections_map[msg.clientID], msg.clientID) + + @dutil.route(dmsg.DDGetLength, _DTBL) + def get_length(self, msg: dmsg.DDGetLength, recvh): + try: + resp_msg = dmsg.DDGetLengthResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS, length=len(self._kvs)) + self._send_msg(resp_msg, self._buffered_client_connections_map[msg.clientID], msg.clientID) + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in get_length from manager {self._puid}: {ex} \n Traeback: \n {tb}') + raise RuntimeError(f'There was an exception in get_length from manager {self._puid}: {ex} \n Traeback: \n {tb}') + + @dutil.route(dmsg.DDClear, _DTBL) + def clear(self, msg: dmsg.DDClear, recvh): + # The underlying memory in the pool needs to be cleaned up + # in addition to the key/value maps. 
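+        # Free every stored value buffer first, then every key buffer, before clearing the maps.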
+ for key in self._kvs: + val_list = self._kvs[key] + self._kvs[key] = [] + while len(val_list) > 0: + val = val_list.pop() + try: + val.free() + except: + pass + + for key_mem in self._key_map: + try: + key_mem.free() + except: + pass + + self._kvs.clear() + self._key_map.clear() + resp_msg = dmsg.DDClearResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS) + self._send_msg(resp_msg, self._buffered_client_connections_map[msg.clientID], msg.clientID) + + @dutil.route(dmsg.DDGetIterator, _DTBL) + def get_iterator(self, msg:dmsg.DDGetIterator, recvh): + iter_id = self._iter_inc() + self.iterators[iter_id] = iter(self._kvs) + resp_msg = dmsg.DDGetIteratorResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS, iterID=iter_id) + self._send_msg(resp_msg, self._buffered_client_connections_map[msg.clientID], msg.clientID) + + @dutil.route(dmsg.DDIteratorNext, _DTBL) + def iterate_next(self, msg:dmsg.DDIteratorNext, recvh): + with self._client_connections_map[msg.clientID].sendh(use_main_as_stream_channel=True, timeout=self._timeout) as sendh: + try: + key = next(self.iterators[msg.iterID]) + resp_msg = dmsg.DDIteratorNextResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS) + sendh.send_bytes(resp_msg.serialize(), timeout=self._timeout) + sendh.send_mem(key, transfer_ownership=False, arg=KEY_HINT, timeout=self._timeout) + except StopIteration: + resp_msg = dmsg.DDIteratorNextResponse(self._tag_inc(), ref=msg.tag, err=DragonError.NO_MORE_KEYS) + del self.iterators[msg.iterID] + sendh.send_bytes(resp_msg.serialize(), timeout=self._timeout) + + @dutil.route(dmsg.DDKeys, _DTBL) + def keys(self, msg: dmsg.DDKeys, recvh): + with self._client_connections_map[msg.clientID].sendh(use_main_as_stream_channel=True, timeout=self._timeout) as sendh: + resp_msg = dmsg.DDKeysResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS) + sendh.send_bytes(resp_msg.serialize(), timeout=self._timeout) + for k in self._kvs.keys(): + sendh.send_mem(k, transfer_ownership=False, arg=KEY_HINT, timeout=self._timeout) + + @dutil.route(dmsg.DDDeregisterClient, _DTBL) + def deregister_client(self, msg: dmsg.DDDeregisterClient, recvh): + try: + self._client_connections_map[msg.clientID].detach() + del self._client_connections_map[msg.clientID] + + resp_msg = dmsg.DDDeregisterClientResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS) + self._send_msg(resp_msg, self._buffered_client_connections_map[msg.clientID], msg.clientID) + + self._buffered_client_connections_map[msg.clientID].detach() + del self._buffered_client_connections_map[msg.clientID] + + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception while deregistering client in manager. Exception: {ex}\n Traceback: {tb}') + raise RuntimeError(f'There was an exception while deregistering client in manager. 
Exception: {ex}\n Traceback: {tb}') + +def manager_proc(pool_size: int, serialized_return_orc, serialized_main_orc, args): + try: + manager = Manager(pool_size, serialized_return_orc, serialized_main_orc, args) + manager.run() + log.debug('Manager is exiting....') + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception initing the manager: {ex}\n Traceback: {tb}') + raise RuntimeError(f'There was an exception initing the manager: {ex}\n Traceback: {tb}') \ No newline at end of file diff --git a/src/dragon/data/ddict/orchestrator.py b/src/dragon/data/ddict/orchestrator.py new file mode 100644 index 0000000..fefb577 --- /dev/null +++ b/src/dragon/data/ddict/orchestrator.py @@ -0,0 +1,231 @@ +""" The Distributed Dictionary is a performant and distributed key-value store +that is available to applications and workflows written for the Dragon ecosystem. + +This is Dragon's specialized implementation based on the Dragon file-like interface +which relies on Dragon Channels. The Distributed Dictionary is to work like a standard +Python dictionary except that the data that it holds may span multiple nodes and be +larger than any one node can hold. + +The orchestrator's role is to coordinate any activity over the entire distributed +dictionary. For instance it coordinates creation and teardown of the distributed +dictionary. It also provides information to clients that are attaching to the +dictionary. + +""" + +import logging +import traceback +import socket +import cloudpickle + +from ...utils import b64decode, b64encode +from ...globalservices import channel +from ...infrastructure import facts +from ...infrastructure import parameters +from ...infrastructure import util as dutil +from ...infrastructure import messages as dmsg +from ...channels import Channel +from ...native.process import ProcessTemplate +from ...native.process_group import ProcessGroup +from .manager import manager_proc +from ... import fli +from ...rc import DragonError +from ...dlogging.util import setup_BE_logging, DragonLoggingServices as dls + +fname = f'{dls.DD}_{socket.gethostname()}_orchestrator_{str(parameters.this_process.my_puid)}.log' +setup_BE_logging(service=dls.DD, fname=fname) +log = logging.getLogger(str(dls.DD)) + +class Orchestrator: + + _DTBL = {} # dispatch router, keyed by type of message + + def __init__(self, managers_per_node, n_nodes, total_mem): + try: + # Use this connection for the managers to respond to the orchestrator + # so clients sending to main channel are ignored until registration of + # managers is complete. + # This must be done first to do the SH handshake so the SH_RETURN + # channel can safely be used here. + return_channel = Channel.make_process_local() + self._return_connector = fli.FLInterface(main_ch=return_channel, use_buffered_protocol=True) + self._serialized_return_connector = b64encode(self._return_connector.serialize()) + + self._default_muid = facts.default_pool_muid_from_index(parameters.this_process.index) + + self._main_channel_desc = channel.create(self._default_muid) + self._main_channel = Channel.attach(self._main_channel_desc.sdesc) + + self._main_connector = fli.FLInterface(main_ch=self._main_channel, use_buffered_protocol=True) + self._serialized_main_connector = b64encode(self._main_connector.serialize()) + # Bootstrapping code. This is read by the client that created the dictionary. 
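+            # The creating client launches this process with its stdout piped, so printing the
+            # serialized main FLI here is how that client obtains the orchestrator's connector.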
+ print(self._serialized_main_connector, flush=True) + + self._serving = True + self._abnormal_termination = False + self._clients = {} + self._next_client_id = 0 + self._next_main_manager_id = 0 + self._timeout = None + + # The managers are a ProcessGroup. May have to record it differently here. + self._num_managers = (n_nodes * managers_per_node) + self.mpool_size = total_mem // self._num_managers + self._manager_connections = [] + self._serialized_manager_flis = [] + self._serialized_manager_return_flis = [] + + self._tag = 0 + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception in the Orchestrator: {ex} \n Traceback: {tb}') + + def run(self): + + # bring up all managers first. Use SH_RETURN as message channel for response messages. + self._num_managers_created = 0 + self._serving_connector = self._main_connector + + # start serving the request sent to main channel + while self._serving: + try: + with self._serving_connector.recvh(timeout=self._timeout) as recvh: + ser_msg, hint = recvh.recv_bytes() + msg = dmsg.parse(ser_msg) + if type(msg) in self._DTBL: + self._DTBL[type(msg)][0](self, msg=msg) + else: + self._serving = False + self._abnormal_termination = True + log.debug(f'The message {msg} was received and is not understood!') + raise RuntimeError(f'The message {msg} was received and is not understood!') + + except Exception as ex: + self._serving = False + self._abnormal_termination = False + tb = traceback.format_exc() + log.debug(f'There was an exception in the Orchestrator: {ex} \n Traceback: {tb}') + raise RuntimeError(f'There was an exception in the Orchestrator: {ex} \n Traceback: {tb}') + + self._free_resources(msg) + log.info('Exiting orchestrator.') + + def _free_resources(self, msg): + self._manager_pool.join() + self._manager_pool.stop() + log.info('Stopped manager pool') + + for i in range(self._num_managers): + self._manager_connections[i].detach() + + self._main_connector.destroy() + channel.destroy(self._main_channel_desc.c_uid) + + resp_msg = dmsg.DDDestroyResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS) + connection = fli.FLInterface.attach(b64decode(self._destroying_client_fli)) + self._send_msg(resp_msg, connection) + connection.detach() + + def _send_msg(self, resp_msg, connection): + try: + with connection.sendh(timeout=self._timeout) as sendh: + sendh.send_bytes(resp_msg.serialize(), timeout=self._timeout) + except Exception as e: + tb = traceback.format_exc() + log.debug(f'There is an exception in orchestrator Exception {e}\n Traceback: {tb}\n.') + + def _tag_inc(self): + tag = self._tag + self._tag += 1 + return tag + + def _clientid_inc(self): + clientid = self._next_client_id + self._next_client_id += 1 + return clientid + + def _get_next_main_manager_id(self): + manager_id = self._next_main_manager_id + self._next_main_manager_id = (self._next_main_manager_id + 1) % self._num_managers + return manager_id + + @dutil.route(dmsg.DDRegisterClient, _DTBL) + def register_client(self, msg: dmsg.DDRegisterClient) -> None: + resp_msg = dmsg.DDRegisterClientResponse(tag=self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS, clientID=self._clientid_inc(), numManagers=self._num_managers) + connection = fli.FLInterface.attach(b64decode(msg.respFLI)) + self._send_msg(resp_msg, connection) + connection.detach() + + @dutil.route(dmsg.DDCreate, _DTBL) + def create(self, msg: dmsg.DDCreate) -> None: + self._create_req_msg_tag = msg.tag + self._create_req_msg_respFLI = msg.respFLI + args = 
cloudpickle.loads(b64decode(msg.args)) + self._working_set_size, self._wait_for_keys, self._wait_for_writers, self._policy, self._persist_freq, self._persist_base_name, self._timeout = args + template = ProcessTemplate(manager_proc, args=(self.mpool_size, self._serialized_return_connector, self._serialized_main_connector, args), policy=self._policy) + self._manager_pool = ProcessGroup(restart=False) + self._manager_pool.add_process(nproc=self._num_managers, template=template) + self._manager_pool.init() + self._manager_pool.start() + self._serving_connector = self._return_connector + + @dutil.route(dmsg.DDRegisterManager, _DTBL) + def register_manager(self, msg: dmsg.DDRegisterManager) -> None: + try: + self._manager_connections.append(fli.FLInterface.attach(b64decode(msg.mainFLI))) + self._serialized_manager_flis.append(msg.mainFLI) + self._serialized_manager_return_flis.append((msg.respFLI, msg.tag)) + self._num_managers_created += 1 + if self._num_managers_created == self._num_managers: + # send response message to all managers + for i, (m, tag) in enumerate(self._serialized_manager_return_flis): + resp_msg = dmsg.DDRegisterManagerResponse(self._tag_inc(), ref=tag, err=DragonError.SUCCESS, managerID=i, managers=self._serialized_manager_flis) + connection = fli.FLInterface.attach(b64decode(m)) + self._send_msg(resp_msg, connection) + connection.detach() + # send DD create response to client + resp_msg = dmsg.DDCreateResponse(self._tag_inc(), ref=self._create_req_msg_tag, err=DragonError.SUCCESS) + connection = fli.FLInterface.attach(b64decode(self._create_req_msg_respFLI)) + self._send_msg(resp_msg, connection) + connection.detach() + # dictionary created - switch to main connector to serve other requests + self._serving_connector = self._main_connector + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception while registering managers: {ex} \n Traceback: {tb}') + raise RuntimeError(f'There was an exception while registering managers: {ex} \n Traceback: {tb}') + + @dutil.route(dmsg.DDGetRandomManager, _DTBL) + def get_random_manager(self, msg: dmsg.DDGetRandomManager) -> None: + managerFLI = self._serialized_manager_flis[self._get_next_main_manager_id()] + resp_msg = dmsg.DDGetRandomManagerResponse(self._tag_inc(), ref=msg.tag, err=DragonError.SUCCESS, manager=managerFLI) + connection = fli.FLInterface.attach(b64decode(msg.respFLI)) + self._send_msg(resp_msg, connection) + connection.detach() + + @dutil.route(dmsg.DDDestroy, _DTBL) + def destroy_dict(self, msg: dmsg.DDDestroy) -> None: + self._serving_connector = self._return_connector + self._destroyed_managers = 0 + for m in self._manager_connections: + req_msg = dmsg.DDDestroyManager(tag=self._tag_inc(), respFLI=self._serialized_return_connector) + self._send_msg(req_msg, m) + + self._destroying_client_fli = msg.respFLI + + @dutil.route(dmsg.DDDestroyManagerResponse, _DTBL) + def destroy_manager_response(self, msg: dmsg.DDDestroyManagerResponse) -> None: + if msg.err != DragonError.SUCCESS: + raise Exception(f'{msg.err} Failed to destroy manager!') + self._destroyed_managers += 1 + if self._destroyed_managers == self._num_managers: + self._serving = False + +def start(managers_per_node: int, n_nodes: int, total_mem: int): + try: + log.debug('Initing Orchestrator') + orc = Orchestrator(managers_per_node, n_nodes, total_mem) + orc.run() + except Exception as ex: + tb = traceback.format_exc() + log.debug(f'There was an exception initing the orchestrator: {ex} \n Traceback: {tb}') \ No newline at end of 
file diff --git a/src/dragon/data/distdictionary/dict_managers.py b/src/dragon/data/distdictionary/dict_managers.py index 8cc00a9..9a8bac1 100644 --- a/src/dragon/data/distdictionary/dict_managers.py +++ b/src/dragon/data/distdictionary/dict_managers.py @@ -112,6 +112,9 @@ def manager_proc(pool_size: int, mowner_chnl_name: str) -> None: manager_kv = dict() manager_client_map = dict() + LOG.debug(f'{manager_pool_mem} ; {manager_pool_mem.muid=}') + LOG.debug(f'The pool free space is {manager_pool_mem.free_space} and utilization is {manager_pool_mem.utilization} percent.') + serving = True while serving: try: diff --git a/src/dragon/data/distdictionary/distributed_dict.py b/src/dragon/data/distdictionary/distributed_dict.py index 3641f7e..2cf648c 100644 --- a/src/dragon/data/distdictionary/distributed_dict.py +++ b/src/dragon/data/distdictionary/distributed_dict.py @@ -28,7 +28,7 @@ from ...infrastructure import facts from ...infrastructure import parameters from ...channels import Channel, Message, Many2ManyWritingChannelFile, Many2ManyReadingChannelFile -from ...native.process import TemplateProcess, Process +from ...native.process import ProcessTemplate, Process from ...native.process_group import ProcessGroup from .dict_managers import manager_proc @@ -103,7 +103,7 @@ def start(self, dict_chnl_name): # Initialize the manager pool of workers # Pool workers are manager processes that serve the data to the client args = (mpool_size, mowner_chnl_create.name) - template = TemplateProcess(manager_proc, args=args) + template = ProcessTemplate(manager_proc, args=args) self.manager_pool = managerPool() self.manager_pool.add_process(n_workers, template) self.manager_pool.init() diff --git a/src/dragon/dlogging/util.py b/src/dragon/dlogging/util.py index abe5f34..51551f4 100644 --- a/src/dragon/dlogging/util.py +++ b/src/dragon/dlogging/util.py @@ -127,7 +127,9 @@ class DragonLoggingServices(str, enum.Enum): :GS: 'GS' :TA: 'TA' :ON: 'ON' + :OOB: 'OOB' :LS: 'LS' + :DD: 'DD' :TEST: 'TEST' :PERF: 'PERF' """ @@ -136,7 +138,9 @@ class DragonLoggingServices(str, enum.Enum): GS = 'GS' TA = 'TA' ON = "ON" + OOB = 'OOB' LS = 'LS' + DD = 'DD' TEST = 'TEST' PERF = 'PERF' diff --git a/src/dragon/dtypes_inc.pxd b/src/dragon/dtypes_inc.pxd index be58526..d626266 100644 --- a/src/dragon/dtypes_inc.pxd +++ b/src/dragon/dtypes_inc.pxd @@ -109,9 +109,15 @@ cdef extern from "": uint64_t type_id, uint64_t offset, const dragonULInt* bytes_size) nogil dragonError_t dragon_memory_attr_init(dragonMemoryPoolAttr_t * attr) nogil dragonError_t dragon_memory_attr_destroy(dragonMemoryPoolAttr_t * attr) nogil - dragonError_t dragon_memory_pool_descr_clone(dragonMemoryPoolDescr_t * newpool_descr, - const dragonMemoryPoolDescr_t * oldpool_descr) nogil bool dragon_memory_pool_is_local(dragonMemoryPoolDescr_t * pool_descr) nogil + dragonError_t dragon_memory_pool_muid(dragonMemoryPoolDescr_t* pool_descr, dragonULInt* muid) nogil + dragonError_t dragon_memory_pool_get_free_size(dragonMemoryPoolDescr_t* pool_descr, uint64_t* free_size) nogil + dragonError_t dragon_memory_pool_get_total_size(dragonMemoryPoolDescr_t* pool_descr, uint64_t* total_size) nogil + dragonError_t dragon_memory_pool_get_utilization_pct(dragonMemoryPoolDescr_t* pool_descr, double* utilization_pct) nogil + dragonError_t dragon_memory_pool_get_rt_uid(dragonMemoryPoolDescr_t * pool_descr, dragonULInt *rt_uid) nogil + dragonError_t dragon_memory_pool_muid(dragonMemoryPoolDescr_t* pool_descr, dragonULInt* muid) nogil + dragonError_t 
dragon_memory_pool_get_free_size(dragonMemoryPoolDescr_t* pool_descr, uint64_t* free_size) nogil + dragonError_t dragon_memory_pool_get_utilization_pct(dragonMemoryPoolDescr_t* pool_descr, double* utilization_pct) nogil # Memory allocation actions dragonError_t dragon_memory_alloc(dragonMemoryDescr_t * mem_descr, dragonMemoryPoolDescr_t * pool_descr, size_t bytes) nogil @@ -124,7 +130,10 @@ cdef extern from "": dragonError_t dragon_memory_get_pointer(dragonMemoryDescr_t * mem_descr, void ** ptr) nogil dragonError_t dragon_memory_get_size(dragonMemoryDescr_t * mem_descr, size_t * bytes) nogil dragonError_t dragon_memory_descr_clone(dragonMemoryDescr_t * newmem_descr, const dragonMemoryDescr_t * oldmem_descr, ptrdiff_t offset, size_t * custom_length) nogil - + dragonError_t dragon_memory_hash(dragonMemoryDescr_t* mem_descr, dragonULInt* hash_value) nogil + dragonError_t dragon_memory_equal(dragonMemoryDescr_t* mem_descr1, dragonMemoryDescr_t* mem_descr2, bool* result) nogil + dragonError_t dragon_memory_copy(dragonMemoryDescr_t* from_mem, dragonMemoryDescr_t* to_mem, dragonMemoryPoolDescr_t* to_pool, const timespec_t* timeout) + dragonError_t dragon_memory_get_pool(const dragonMemoryDescr_t * mem_descr, dragonMemoryPoolDescr_t * pool_descr) nogil cdef extern from "": @@ -269,6 +278,12 @@ cdef extern from "": dragonError_t dragon_channel_attr_destroy(dragonChannelAttr_t * attr) nogil dragonError_t dragon_channel_get_attr(const dragonChannelDescr_t * ch, dragonChannelAttr_t * attr) nogil + # Message Attributes + dragonError_t dragon_channel_message_attr_init(dragonMessageAttr_t* attr) nogil + dragonError_t dragon_channel_message_attr_destroy(dragonMessageAttr_t* attr) nogil + dragonError_t dragon_channel_message_getattr(const dragonMessage_t* msg, dragonMessageAttr_t* attr) nogil + dragonError_t dragon_channel_message_setattr(dragonMessage_t* msg, const dragonMessageAttr_t* attr) nogil + # Messages dragonError_t dragon_channel_message_init(dragonMessage_t * msg, dragonMemoryDescr_t * mem_descr, const dragonMessageAttr_t * mattrs) nogil dragonError_t dragon_channel_message_get_mem(const dragonMessage_t * msg, dragonMemoryDescr_t * mem_descr) nogil @@ -276,6 +291,7 @@ cdef extern from "": dragonError_t dragon_channel_message_count(const dragonChannelDescr_t * ch, uint64_t * count) nogil dragonError_t dragon_channel_barrier_waiters(const dragonChannelDescr_t* ch, uint64_t* count) nogil dragonError_t dragon_channel_blocked_receivers(const dragonChannelDescr_t* ch, uint64_t* count) nogil + bool dragon_channel_barrier_is_broken(const dragonChannelDescr_t* ch) nogil # Send handle @@ -306,6 +322,9 @@ cdef extern from "": dragonError_t dragon_channel_gatewaymessage_transport_event_cmplt(dragonGatewayMessage_t * gmsg, const dragonULInt event_result, const dragonError_t op_err) nogil dragonError_t dragon_channel_gatewaymessage_destroy(dragonGatewayMessage_t * gmsg) nogil + # Utility functions + dragonError_t dragon_create_process_local_channel(dragonChannelDescr_t* ch, const timespec_t* timeout) nogil + cdef extern from "": ctypedef struct dragonChannelSetAttrs_t: int num_allowed_spin_waiters @@ -430,11 +449,19 @@ cdef extern from "_heap_manager.h": dragonError_t dragon_heap_dump_to_fd(FILE * fd, const char * title, const dragonDynHeap_t * heap) cdef extern from "": + dragonULInt dragon_get_local_rt_uid() dragonError_t dragon_set_procname(char * name) - char * dragon_base64_encode(uint8_t *data, size_t input_length, size_t *output_length) - uint8_t * dragon_base64_decode(const char *data, size_t 
input_length, size_t *output_length) + char * dragon_base64_encode(uint8_t *data, size_t input_length) + uint8_t * dragon_base64_decode(const char *data, size_t *output_length) dragonError_t dragon_timespec_deadline(timespec_t * timer, timespec_t * deadline) dragonError_t dragon_timespec_remaining(timespec_t * end_time, timespec_t * remaining_timeout) + dragonULInt dragon_hash(void* ptr, size_t num_bytes) + bool dragon_bytes_equal(void* ptr1, void* ptr2, size_t ptr1_numbytes, size_t ptr2_numbytes) + dragonError_t dragon_ls_set_kv(const unsigned char* key, const unsigned char* value, const timespec_t* timeout) nogil + dragonError_t dragon_ls_get_kv(const unsigned char* key, char** value, const timespec_t* timeout) nogil + +cdef extern from "": + size_t strlen(const char *s) cdef extern from "logging.h": @@ -606,6 +633,8 @@ cdef extern from "dragon/fli.h": dragonError_t dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t* pool, dragonFLIDescr_t* adapter) nogil dragonError_t dragon_fli_detach(dragonFLIDescr_t* adapter) nogil + dragonError_t dragon_fli_get_available_streams(dragonFLIDescr_t* adapter, uint64_t* num_streams, const timespec_t* timeout) nogil + dragonError_t dragon_fli_is_buffered(const dragonFLIDescr_t* adapter, bool* is_buffered) nogil dragonError_t dragon_fli_open_send_handle(const dragonFLIDescr_t* adapter, dragonFLISendHandleDescr_t* send_handle, dragonChannelDescr_t* strm_ch, const timespec_t* timeout) nogil dragonError_t dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, @@ -613,6 +642,7 @@ cdef extern from "dragon/fli.h": dragonError_t dragon_fli_open_recv_handle(const dragonFLIDescr_t* adapter, dragonFLIRecvHandleDescr_t* recv_handle, dragonChannelDescr_t* strm_ch, const timespec_t* timeout) nogil dragonError_t dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const timespec_t* timeout) nogil + dragonError_t dragon_fli_stream_received(dragonFLIRecvHandleDescr_t* recv_handle, bool* stream_received) dragonError_t dragon_fli_create_writable_fd(dragonFLISendHandleDescr_t* send_handle, int* fd_ptr, const bool buffer, size_t chunk_size, const uint64_t arg, const timespec_t* timeout) nogil dragonError_t dragon_fli_finalize_writable_fd(dragonFLISendHandleDescr_t* send_handle) nogil @@ -622,7 +652,7 @@ cdef extern from "dragon/fli.h": dragonError_t dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, uint8_t* bytes, uint64_t arg, const bool buffer, const timespec_t* timeout) nogil dragonError_t dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t* mem, - uint64_t arg, const timespec_t* timeout) nogil + uint64_t arg, bool transfer_ownership, const timespec_t* timeout) nogil dragonError_t dragon_fli_recv_bytes(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, size_t* received_size, uint8_t** bytes, uint64_t* arg, const timespec_t* timeout) nogil diff --git a/src/dragon/globalservices/api_setup.py b/src/dragon/globalservices/api_setup.py index dbdffc1..fc9fd75 100644 --- a/src/dragon/globalservices/api_setup.py +++ b/src/dragon/globalservices/api_setup.py @@ -299,7 +299,7 @@ def test_connection_override(test_gs_input=None, test_gs_return=None, _SHEP_RETURN = _connect_shep_return() -def connect_to_infrastructure(): +def connect_to_infrastructure(force=False): global _GS_INPUT global _GS_RETURN global _SHEP_INPUT @@ -311,10 +311,11 @@ def connect_to_infrastructure(): # variables. 
This is done first to enable # off-node communication from this process # as soon as possible. - + if force: + dp.reload_this_process() with _GS_API_LOCK: - if _INFRASTRUCTURE_CONNECTED: + if not force and _INFRASTRUCTURE_CONNECTED: return LOG.info(f'We are registering gateways for this process. {dp.this_process.num_gw_channels_per_node=}') @@ -328,6 +329,9 @@ def connect_to_infrastructure(): _SHEP_RETURN = _connect_shep_return() _INFRASTRUCTURE_CONNECTED = True + if force: + return + LOG.debug('waiting for handshake') global _ARG_PAYLOAD diff --git a/src/dragon/globalservices/group.py b/src/dragon/globalservices/group.py index ee4fb2d..bb2ea57 100644 --- a/src/dragon/globalservices/group.py +++ b/src/dragon/globalservices/group.py @@ -77,7 +77,7 @@ def kill(identifier, sig=signal.SIGKILL): :raises GroupError: if there is no such group :raises GroupError: if the group has not yet started :raises NotImplementedError: if any other case not implemented - :return: Nothing if successful + :return: GSGroupKillResponse.desc """ if isinstance(identifier, str): req_msg = dmsg.GSGroupKill(tag=das.next_tag(), p_uid=this_process.my_puid, @@ -90,7 +90,6 @@ def kill(identifier, sig=signal.SIGKILL): reply_msg = das.gs_request(req_msg) assert isinstance(reply_msg, dmsg.GSGroupKillResponse) - ec = dmsg.GSGroupKillResponse.Errors if reply_msg.err in {ec.SUCCESS, ec.ALREADY}: @@ -314,4 +313,4 @@ def query(identifier): if dmsg.GSGroupQueryResponse.Errors.SUCCESS == reply_msg.err: return reply_msg.desc else: - raise GroupError(f'group query {req_msg} failed: {reply_msg.err_info}') + raise GroupError(f'group query {req_msg} failed: {reply_msg.err_info}') \ No newline at end of file diff --git a/src/dragon/globalservices/group_int.py b/src/dragon/globalservices/group_int.py index 9715400..9502bbf 100644 --- a/src/dragon/globalservices/group_int.py +++ b/src/dragon/globalservices/group_int.py @@ -1,6 +1,9 @@ """Global services' internal Group context.. 
""" +from collections import defaultdict +from typing import Dict, List + from .process_int import ProcessContext from .channel_int import ChannelContext from .pool_int import PoolContext @@ -15,6 +18,7 @@ from .group import GroupError from .policy_eval import PolicyEvaluator +import os import logging import copy import signal @@ -54,10 +58,8 @@ def __init__(self, items, layout_list): self.control_port = dfacts.DEFAULT_PMI_CONTROL_PORT + self.job_id self.pid_base_map = {} # key h_uid, value base value - LOG.debug('Before allocating pid base _PMI_PID_DICT=%s', str(PMIJobHelper._PMI_PID_DICT)) for h_uid, lranks in self.ppn_map.items(): self.pid_base_map[h_uid] = PMIJobHelper.allocate_pmi_pid_base(h_uid, lranks) - LOG.debug('After allocating pid base _PMI_PID_DICT=%s', str(PMIJobHelper._PMI_PID_DICT)) @classmethod def is_pmi_required(cls, items:list[tuple]): @@ -187,16 +189,10 @@ def __next__(self): if self._index < self.nranks: lrank = self.lrank_list[self._index] h_uid = self.get_host_id_from_index(self._index) - pmi_info = dmsg.PMIInfo( - job_id=self.job_id, + pmi_info = dmsg.PMIProcessInfo( lrank=lrank, ppn=self.ppn_map[h_uid], nid=self.pmi_h_uid_list.index(h_uid), - nnodes=self.pmi_nnodes, - nranks=self.nranks, - nidlist=self.nid_list, - hostlist=self.host_list, - control_port=self.control_port, pid_base=self.pid_base_map[h_uid], ) self._index += 1 @@ -234,8 +230,8 @@ def __init__(self, server, request, reply_channel, g_uid, policy, pmi_job_helper self.destroy_request = None self.pmi_job_helper = pmi_job_helper self.destroy_called = False - self.destroy_remove_success_ids = None # used when destroy_remove is called to keep the items to be - # destroyed after having received all the SHProcessKillResponse messages + self.destroy_remove_success_ids = None # used when destroy_remove is called to keep the items to be + # destroyed after having received all the SHProcessKillResponse messages self._descriptor = group_desc.GroupDescriptor(g_uid=g_uid, name=request.user_name, policy=policy) @@ -352,16 +348,29 @@ def construct(cls, server, msg, reply_channel): if not msg.user_name: msg.user_name = auto_name + ## This block maybe needs to change policy = msg.policy - assert isinstance(policy, Policy) # Policy Evaluator wants a list of policies, one for each member total_members = sum([n for n, _ in msg.items]) - policies = [policy] * total_members + # With discussion this can be cleaned up and either always be a list + # or policy merging logic for process level policies and PG level + # policies could happen here + if isinstance(policy, list): + for pol in policy: + assert isinstance(pol, Policy) + policies = policy + else: + #important because the policy does need to be a policy to evaluate it + assert isinstance(policy, Policy) + policies = [policy] * total_members + #policies = policy + LOG.debug('policies=%s', policies) # Gather our node list and evaluate the policies against it layout_list = server.policy_eval.evaluate(policies=policies) - LOG.debug('layout_list=%s', layout_list) + if int(os.environ.get('PMI_DEBUG', 0)): + LOG.info('layout_list=%s', layout_list) pmi_job_helper = None if PMIJobHelper.is_pmi_required(msg.items): @@ -376,13 +385,16 @@ def construct(cls, server, msg, reply_channel): server.group_table[this_guid] = group_context server.group_resource_count[this_guid] = [] # list of items corresponding to the multiplicity of each list in server.group_resource_list + # Maps a given node (local services instance) to a list of + # ProcessContexts that are to be created on that 
instance + ls_proccontext_map : Dict[int, List[ProcessContext]] = defaultdict(list) + # msg.items is a list of tuples of the form (count:int, create_msg: dragon.infrastructure.messages.Message) layout_index = 0 for tuple_idx, item in enumerate(msg.items): count, res_msg = item resource_msg = dmsg.parse(res_msg) - LOG.debug(f'Creating msg {resource_msg}') if count > 0: # initialize a new list group_context.descriptor.sets.append([]) @@ -393,39 +405,45 @@ def construct(cls, server, msg, reply_channel): layout_index += 1 # we need a unique copy of the msg for each member - msg = copy.deepcopy(resource_msg) - msg.tag = das.next_tag() + resource_copy = copy.deepcopy(resource_msg) + resource_copy.tag = das.next_tag() # create a unique name for this resource based on the tuple user_name - if msg.user_name: - msg.user_name = f'{msg.user_name}.{this_guid}.{tuple_idx}.{item_idx}' + if resource_copy.user_name: + resource_copy.user_name = f'{resource_copy.user_name}.{this_guid}.{tuple_idx}.{item_idx}' # update the layout to place the process correctly - msg.layout = item_layout + resource_copy.layout = item_layout # add a new resource into this list group_context.descriptor.sets[tuple_idx] += [group_desc.GroupDescriptor.GroupMember()] # this is where we're actually starting the creation of the group members if isinstance(resource_msg, dmsg.GSProcessCreate): - - if msg.pmi_required: + if resource_msg.pmi_required: # The PMIJobHelper generates the pmi_info structure # from the layout_map and list of group items. - msg._pmi_info = next(pmi_job_iter) - LOG.info(f'{msg._pmi_info=}') + resource_copy.pmi_required = True + resource_copy._pmi_info = next(pmi_job_iter) + if int(os.environ.get('PMI_DEBUG', 0)): + LOG.info('%s', resource_copy._pmi_info) - issued, outbound_tag, proc_context = ProcessContext.construct(server, msg, reply_channel, - head=False, send_msg=False, - belongs_to_group=True) + success, outbound_tag, proc_context = ProcessContext.construct(server, resource_copy, reply_channel, + head=False, send_msg=False, + belongs_to_group=True) server.resource_to_group_map[proc_context.descriptor.p_uid] = (this_guid, (tuple_idx, item_idx)) - if issued and outbound_tag and proc_context: + if success and outbound_tag and proc_context: + ls_proccontext_map[proc_context.node].append(proc_context) server.pending[outbound_tag] = group_context.complete_construction server.group_to_pending_resource_map[(outbound_tag, this_guid)] = proc_context + + # Update the group with the corresponding GroupMember + member = group_context._generate_member(this_guid, proc_context) + group_context._update_group_member(this_guid, member, tuple_idx, item_idx, related_to_create=False) else: - if proc_context and outbound_tag == 'already': # the process was already created + if proc_context and outbound_tag == 'already': # the process was already created LOG.debug(f"The process {proc_context.descriptor.p_uid} was already created.") member = {'state': proc_context.descriptor.state, 'uid': proc_context.descriptor.p_uid, @@ -441,7 +459,7 @@ def construct(cls, server, msg, reply_channel): # be cleared, a group pending construct_completion will be issued from process_int continue - else: # there was a failure + else: # there was a failure # in this case, outbound_tag contains the error message LOG.debug(f"There was a failure in the process {proc_context.descriptor.p_uid} creation: {outbound_tag}") member = group_context._generate_member(this_guid, proc_context, outbound_tag) @@ -461,6 +479,33 @@ def construct(cls, server, msg, 
reply_channel): else: raise GroupError('The Group should include at least one member in each subgroup.') + # If PMI is required, we need to send these common PMI options. + # By sending them as part of the SHMultiProcessCreate message, + # we limit the duplication of these common vaules in each embedded + # SHProcessCreate message, reducing the overall message size. + pmi_group_info : dmsg.PMIGroupInfo = None + if pmi_job_helper: + pmi_group_info : dmsg.PMIGroupInfo = dmsg.PMIGroupInfo( + job_id=pmi_job_helper.job_id, + nnodes=pmi_job_helper.pmi_nnodes, + nranks=pmi_job_helper.nranks, + nidlist=pmi_job_helper.nid_list, + hostlist=pmi_job_helper.host_list, + control_port=pmi_job_helper.control_port + ) + + for node, contexts in ls_proccontext_map.items(): + procs = [context.shprocesscreate_msg for context in contexts] + shep_req = dmsg.SHMultiProcessCreate( + tag=server.tag_inc(), + r_c_uid=dfacts.GS_INPUT_CUID, + pmi_group_info=pmi_group_info, + procs=procs + ) + shep_hdl = server.shep_inputs[node] + server.pending_sends.put((shep_hdl, shep_req.serialize())) + LOG.debug(f'request %s to shep %d', shep_req, node) + return True def _construction_helper(self, msg): @@ -502,6 +547,14 @@ def _construction_helper(self, msg): else: raise RuntimeError(f'got {msg!s} err {msg.err} unknown') + # Before updating, make sure GS hasn't already marked this process as dead via out-of-order + # exit of the proc + cur_state = self.server.group_table[guid].descriptor.sets[lst_idx][item_idx].state + if cur_state is process_desc.ProcessDescriptor.State.DEAD: + LOG.debug(f'guid: {guid}, puid {member["uid"]} was already dead prior to create response') + member['state'] = cur_state + member['desc'].state = cur_state + # Update the group with the corresponding GroupMember self._update_group_member(guid, member, lst_idx, item_idx) @@ -732,11 +785,16 @@ def create_add(server, msg, reply_channel): # Gather our node list and evaluate the policies against it layout_list = server.policy_eval.evaluate(policies=policies) - LOG.debug('layout_list=%s', layout_list) + if int(os.environ.get('PMI_DEBUG', 0)): + LOG.debug('layout_list=%s', layout_list) # we need the number of existing lists in the group existing_lists = len(groupdesc.sets) + # Maps a given node (local services instance) to a list of + # ProcessContexts that are to be created on that instance + ls_proccontext_map : Dict[int, List[ProcessContext]] = defaultdict(list) + # msg.items is a list of tuples of the form (count:int, create_msg: dragon.infrastructure.messages.Message) layout_index = 0 for tuple_idx, item in enumerate(msg.items): @@ -754,32 +812,34 @@ def create_add(server, msg, reply_channel): layout_index += 1 # we need a unique copy of the msg for each member - msg = copy.deepcopy(resource_msg) + resource_copy = copy.deepcopy(resource_msg) + resource_copy.tag = das.next_tag() # create a unique name for this resource based on the tuple user_name - if msg.user_name: - msg.user_name = f'{msg.user_name}.{target_uid}.{tuple_idx+existing_lists}.{item_idx}' + if resource_copy.user_name: + resource_copy.user_name = f'{resource_copy.user_name}.{target_uid}.{tuple_idx+existing_lists}.{item_idx}' # update the layout to place the process correctly - msg.layout = item_layout + resource_copy.layout = item_layout # add a new resource into this list groupdesc.sets[tuple_idx+existing_lists] += [group_desc.GroupDescriptor.GroupMember()] # this is where we're actually starting the creation of the group members if isinstance(resource_msg, dmsg.GSProcessCreate): - issued, 
outbound_tag, proc_context = ProcessContext.construct(server, msg, reply_channel, - head=False, send_msg=False, - belongs_to_group=True, - addition=True) + success, outbound_tag, proc_context = ProcessContext.construct(server, resource_copy, reply_channel, + head=False, send_msg=False, + belongs_to_group=True, + addition=True) server.resource_to_group_map[proc_context.descriptor.p_uid] = (target_uid, (tuple_idx+existing_lists, item_idx)) - if issued and outbound_tag and proc_context: + if success and outbound_tag and proc_context: + ls_proccontext_map[proc_context.node].append(proc_context) server.pending[outbound_tag] = groupctx.complete_addition server.group_to_pending_resource_map[(outbound_tag, target_uid)] = proc_context else: - if proc_context and outbound_tag == 'already': # the process was already created + if proc_context and outbound_tag == 'already': # the process was already created member = {'state': proc_context.descriptor.state, 'uid': proc_context.descriptor.p_uid, 'placement': proc_context.descriptor.node, @@ -811,7 +871,17 @@ def create_add(server, msg, reply_channel): raise GroupError(f'Unknown msg type {resource_msg} for a Group member.') else: raise GroupError('The Group should include at least one member in each subgroup.') - return True + + for node, contexts in ls_proccontext_map.items(): + procs = [context.shprocesscreate_msg for context in contexts] + shep_req = dmsg.SHMultiProcessCreate( + tag=server.tag_inc(), + r_c_uid=dfacts.GS_INPUT_CUID, + procs=procs + ) + shep_hdl = server.shep_inputs[node] + server.pending_sends.put((shep_hdl, shep_req.serialize())) + LOG.debug(f'request %s to shep %d', shep_req, node) else: raise NotImplementedError('close case') diff --git a/src/dragon/globalservices/policy_eval.py b/src/dragon/globalservices/policy_eval.py index dbcd559..cdb17b0 100644 --- a/src/dragon/globalservices/policy_eval.py +++ b/src/dragon/globalservices/policy_eval.py @@ -2,6 +2,10 @@ from dataclasses import dataclass, field, fields from ..infrastructure.policy import Policy from ..infrastructure.node_desc import NodeDescriptor +import logging + + +LOG = logging.getLogger('policy_eval:') @dataclass class ResourceLayout: @@ -62,7 +66,7 @@ def _add_layout(self, node, cpu_affinity, gpu_affinity, env_str, layouts): """ # NOTE: Numa node and accelerator are placeholders numa_node = 0 - layouts.append( ResourceLayout(node.h_uid, node.host_name, numa_node, + layouts.append( ResourceLayout(node.h_uid, node.host_name, numa_node, cpu_affinity, gpu_affinity, env_str) ) node.num_policies += 1 @@ -147,7 +151,7 @@ def _get_cpu_affinity(self, p : Policy, node : NodeDescriptor) -> list[int]: """ Generate a list of available devices the policy can be applied to for the given Node """ - + if p.cpu_affinity: # List not empty, assume SPECIFIC affinity affinity = [x for x in node.cpu_devices if x in p.cpu_affinity] return affinity # This covers both "ANY" and "SPECIFIC" if a specific list is given @@ -156,16 +160,20 @@ def _get_cpu_affinity(self, p : Policy, node : NodeDescriptor) -> list[int]: return node.cpu_devices return [] - + def _get_gpu_affinity(self, p : Policy, node : NodeDescriptor) -> list[int]: + #LOG.debug(f'{node=}') if p.gpu_affinity: + #LOG.debug(f'{node.accelerators=}') + assert(isinstance(p.gpu_affinity, list)) affinity = [x for x in node.accelerators.device_list if x in p.gpu_affinity] return affinity - + if p.affinity == Policy.Affinity.ANY and node.accelerators is not None: + #LOG.debug(f'{node.accelerators=}') return node.accelerators.device_list - + return 
[] def evaluate(self, policies : list[Policy]=None) -> list[ResourceLayout]: @@ -179,7 +187,9 @@ def evaluate(self, policies : list[Policy]=None) -> list[ResourceLayout]: for p in policies: # Merge incoming policies against the self.default_policy so any DEFAULT enums get replaced with the default policy option p = self.merge(self.default_policy, p) + #LOG.debug(f'Merged policy {p=}') node = self._get_node(p) # Get a node based on policy (if requesting specific nodes, may raise exception) + #LOG.debug(f'Node {node=}') cpu_affinity = self._get_cpu_affinity(p, node) # Get affinity based on policy gpu_affinity = self._get_gpu_affinity(p, node) env_str = "" # Environment string for setting accelerator affinity @@ -187,6 +197,7 @@ def evaluate(self, policies : list[Policy]=None) -> list[ResourceLayout]: env_str = node.accelerators.env_str self._add_layout(node, cpu_affinity, gpu_affinity, env_str, layouts) + #LOG.debug(f'{layouts=}') return layouts @staticmethod diff --git a/src/dragon/globalservices/process.py b/src/dragon/globalservices/process.py index 120abfa..02e3a51 100644 --- a/src/dragon/globalservices/process.py +++ b/src/dragon/globalservices/process.py @@ -7,6 +7,7 @@ import threading import atexit from distutils.util import strtobool +from typing import List, Tuple, Dict from .. import channels as dch @@ -33,6 +34,7 @@ class StreamDestination(enum.Enum): STDOUT = 1 STDERR = 2 + _capture_stdout_conn = None _capture_stderr_conn = None _capture_stdout_chan = None @@ -42,11 +44,13 @@ class StreamDestination(enum.Enum): DRAGON_STOP_CAPTURING_MP_CHILD_OUTPUT = 'DRAGON_STOP_CAPTURING_MP_CHILD_OUTPUT' _capture_shutting_down = False + # This is for user processes to use when they wish to capture the # output of child multiprocessing processes. def start_capturing_child_mp_output(): os.environ[DRAGON_CAPTURE_MP_CHILD_OUTPUT] = 'True' + def stop_capturing_child_mp_output(): global _capture_shutting_down, _capture_stdout_chan, _capture_stderr_chan @@ -67,7 +71,7 @@ def stop_capturing_child_mp_output(): dgchan.destroy(_capture_stdout_chan.cuid) log.info('stdout channel destroy complete') _capture_stdout_chan = None - except: + except Exception: pass if _capture_stderr_chan is not None: @@ -76,7 +80,7 @@ def stop_capturing_child_mp_output(): dgchan.destroy(_capture_stderr_chan.cuid) log.info('stderr channel destroy complete') _capture_stderr_chan = None - except: + except Exception: pass @@ -228,7 +232,7 @@ def get_create_message(exe, run_dir, args, env, user_name='', options=None, def get_create_message_with_argdata(exe, run_dir, args, env, argdata=None, user_name='', options=None, stdin=None, stdout=None, stderr=None, group=None, - user=None, umask=- 1, pipesize=- 1, pmi_required=False): + user=None, umask=- 1, pipesize=- 1, pmi_required=False, policy=None): """Return a GSProcessCreate object with starting args. This is an extension of the 'get_create_message' method that encapsulates our scheme for getting @@ -283,7 +287,7 @@ def get_create_message_with_argdata(exe, run_dir, args, env, argdata=None, user_ rundir=run_dir, user_name=user_name, options=options, stdin=stdin, stdout=stdout, stderr=stderr, group=group, user=user, umask=umask, - pipesize=pipesize, pmi_required=pmi_required) + pipesize=pipesize, pmi_required=pmi_required, policy=policy) if len(argdata) <= dfacts.ARG_IMMEDIATE_LIMIT: # deliver arguments in the argdata directly. 
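These `policy=` pass-throughs feed the affinity selection added to policy_eval.py earlier in this diff, where a non-empty `cpu_affinity`/`gpu_affinity` list is treated as a SPECIFIC request and intersected with the devices the node reports, while `Affinity.ANY` selects every device on the node. Below is a minimal, self-contained sketch of that rule; `Policy`, `Node`, and `gpu_affinity` here are simplified stand-ins, not the real dragon classes.

```python
# Simplified stand-ins; the real classes are dragon.infrastructure.policy.Policy
# and NodeDescriptor (which exposes accelerators.device_list).
from dataclasses import dataclass, field

@dataclass
class Policy:
    class Affinity:
        ANY = 'ANY'
        SPECIFIC = 'SPECIFIC'
    affinity: str = Affinity.ANY
    gpu_affinity: list = field(default_factory=list)

@dataclass
class Node:
    gpu_devices: list = field(default_factory=list)

def gpu_affinity(p: Policy, node: Node) -> list:
    # A non-empty gpu_affinity list means "SPECIFIC": keep only devices this node has.
    if p.gpu_affinity:
        return [d for d in node.gpu_devices if d in p.gpu_affinity]
    # "ANY": every device on the node is eligible.
    if p.affinity == Policy.Affinity.ANY:
        return node.gpu_devices
    return []

print(gpu_affinity(Policy(gpu_affinity=[0, 2]), Node(gpu_devices=[0, 1, 2, 3])))   # -> [0, 2]
print(gpu_affinity(Policy(), Node(gpu_devices=[0, 1])))                            # -> [0, 1]
```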
@@ -298,7 +302,7 @@ def get_create_message_with_argdata(exe, run_dir, args, env, argdata=None, user_ rundir=run_dir, user_name=user_name, options=options, stdin=stdin, stdout=stdout, stderr=stderr, group=group, user=user, umask=umask, - pipesize=pipesize, pmi_required=pmi_required) + pipesize=pipesize, pmi_required=pmi_required, policy=policy) else: raise NotImplementedError(f"Argument data larger than {dfacts.ARG_IMMEDIATE_LIMIT} bytes is not supported at the moment.") @@ -367,6 +371,7 @@ def create(exe, run_dir, args, env, user_name='', options=None, soft=False, reply_msg = das.gs_request(req_msg) log.debug('got GSProcessCreateResponse') + assert isinstance(reply_msg, dmsg.GSProcessCreateResponse) ec = dmsg.GSProcessCreateResponse.Errors @@ -392,7 +397,7 @@ def prepare_argdata_for_immediate(argdata): def create_with_argdata(exe, run_dir, args, env, argdata=None, user_name='', options=None, soft=False, pmi_required=False, - stdin=None, stdout=None, stderr=None): + stdin=None, stdout=None, stderr=None, policy=None): """Asks Global services to create a new process and deliver starting args to it thru messaging. This is an extension of the 'create' method that encapsulates our scheme for getting @@ -435,7 +440,8 @@ def create_with_argdata(exe, run_dir, args, env, argdata=None, user_name='', pmi_required=pmi_required, stdin=stdin, stdout=stdout, - stderr=stderr) + stderr=stderr, + policy=policy) elif len(argdata) <= dfacts.ARG_IMMEDIATE_LIMIT: # deliver arguments in the argdata directly. @@ -454,7 +460,8 @@ def create_with_argdata(exe, run_dir, args, env, argdata=None, user_name='', pmi_required=pmi_required, stdin=stdin, stdout=stdout, - stderr=stderr) + stderr=stderr, + policy=policy) else: # TODO: capture these comments in documentation # Here we don't set argdata to anything at all. @@ -481,7 +488,8 @@ def create_with_argdata(exe, run_dir, args, env, argdata=None, user_name='', pmi_required=pmi_required, stdin=stdin, stdout=stdout, - stderr=stderr) + stderr=stderr, + policy=policy) # Another transaction to gs but we are only here if # we are sending a lot of data. Could be returned with create call @@ -627,21 +635,71 @@ def join(identifier, timeout=None): raise ProcessError(f'process join {req_msg} failed: {reply_msg.err_info}') -def multi_join(identifiers, timeout=None, join_all=False): +def get_multi_join_success_puids(statuses: Dict[str, List[int]]) -> Tuple[List[Tuple[int, int]], bool]: + """Go through list of processes that have been joined on to isolate successful zero exits + + :param statuses: Dict of puid keys pointing to error status and exit codes of multi_join function + :type statuses: Dict[str, List[int, int]] + :returns: List comprised of puids and exit code tuples, if exit code was zero. 
And whether + there was a timeout + :rtype: Tuple[List[Tuple[int, int]], bool] + """ + + ec = dmsg.GSProcessJoinListResponse.Errors # get the error codes + success_list = [] + timeout_flag = False + for t_p_uid, status_info in statuses.items(): + if status_info[0] == ec.SUCCESS.value: + success_list.append((int(t_p_uid), status_info[1])) + elif status_info[0] == ec.TIMEOUT.value: + timeout_flag = True + return success_list, timeout_flag + + +def get_multi_join_failure_puids(statuses: Dict[str, List[int]]) -> Tuple[List[Tuple[int, int]], bool]: + """Go through list of processes that have been joined on to isolate non-zero exits + + :param statuses: Dict of puid keys pointing to error status and exit codes of multi_join function + :type statuses: Dict[str, List[int, int]] + :returns: Tuple made up of List comprised of puids and exit code tuples, if exit was no-zero. + And whether there was a timeout + :rtype: Tuple[List[Tuple[int, int]], bool] + """ + ec = dmsg.GSProcessJoinListResponse.Errors # get the error codes + failure_list = [] + timeout_flag = False + for t_p_uid, status_info in statuses.items(): + # Look for non-zero exit codes + if status_info[1] not in [0, None]: + failure_list.append((int(t_p_uid), status_info[1])) + elif status_info[0] == ec.TIMEOUT.value: + timeout_flag = True + return failure_list, timeout_flag + + +def multi_join(identifiers: List[int or str], + timeout: bool = None, + join_all: bool = False, + return_on_bad_exit: bool = False) -> Tuple[List[Tuple[int, int]], Dict]: """Asks Global Services to join a list of specified managed processes. If join_all is False, it returns when 'any' process has exited or there is a timeout. If join_all is True, it returns when 'all' processes have exited or there is a timeout. :param identifiers: list of process identifiers indicating p_uid (int) or process name (string) - :param timeout: Timeout in seconds for max time to wait. None = default, infinite wait - :param join_all: indicates whether we need to wait on all processes in the list or not - :return: If join_all is False, return a list of tuples (p_uid, unix_exit_code) for any processes exited, - or None if none exited and there is a timeout, along with a dictionary with the status of each process. - If join_all is True, return a list of tuples (p_uid, unix_exit_code) when all processes exited, - or None if none exited and there is a timeout or some exited and some errored/timed out, - along with a dictionary with the status of each process. - :raises: ProcessError if there is no such process or some other error has occurred. + :type identifiers: List[int, str] + :param timeout: Timeout in seconds for max time to wait. defaults to None, infinite wait + :type timeout: bool, optional + :param join_all: indicates whether we need to wait on all processes in the list or not, defaults to False + :type join_all: bool, optional + :param return_on_bad_exit: If join_all is True, multi_join will still return if there was a + non-zero exit, defaults to False + :type return_on_bad_exit: bool, optional + :returns: If join_all is False, return a list of tuples (p_uid, unix_exit_code) for any processes exited, + or None if none exited and there is a timeout, along with a dictionary with the status of each process. + If join_all is True, return a list of tuples (p_uid, unix_exit_code) when all processes exited, + or None if none exited and there is a timeout or some exited and some errored/timed out, + along with a dictionary with the status of each process. 
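Both helpers above walk the `puid_status` dictionary that `multi_join` returns, in which each puid maps to an `[error_code, exit_code]` pair. The example below uses made-up puids, exit codes, and error-code values (stand-ins for `dmsg.GSProcessJoinListResponse.Errors`) to show what each filter keeps.

```python
# Hypothetical data; the real error codes come from dmsg.GSProcessJoinListResponse.Errors.
SUCCESS, TIMEOUT, PENDING = 0, 1, 2

puid_status = {
    '4001': [SUCCESS, 0],      # joined, clean exit
    '4002': [SUCCESS, 137],    # joined, non-zero exit
    '4003': [TIMEOUT, None],   # join timed out
}

# Mirrors get_multi_join_success_puids: keep every entry whose join succeeded.
success_list = [(int(p), st[1]) for p, st in puid_status.items() if st[0] == SUCCESS]

# Mirrors get_multi_join_failure_puids: keep entries with a non-zero exit code.
failure_list = [(int(p), st[1]) for p, st in puid_status.items() if st[1] not in (0, None)]

timed_out = any(st[0] == TIMEOUT for st in puid_status.values())

print(success_list)   # [(4001, 0), (4002, 137)]
print(failure_list)   # [(4002, 137)]
print(timed_out)      # True
```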
""" if len(identifiers) == 0: @@ -665,35 +723,25 @@ def multi_join(identifiers, timeout=None, join_all=False): req_msg = dmsg.GSProcessJoinList(tag=das.next_tag(), p_uid=this_process.my_puid, r_c_uid=das.get_gs_ret_cuid(), timeout=msg_timeout, t_p_uid_list=puid_identifiers, user_name_list=name_identifiers, - join_all=join_all) + join_all=join_all, return_on_bad_exit=return_on_bad_exit) reply_msg = das.gs_request(req_msg) assert isinstance(reply_msg, dmsg.GSProcessJoinListResponse) - - ec = dmsg.GSProcessJoinListResponse.Errors # get the error codes - - success_list = [] - timeout_flag = False - for t_p_uid, status_info in reply_msg.puid_status.items(): - if status_info[0] == ec.SUCCESS.value: - success_list.append((int(t_p_uid), status_info[1])) - elif status_info[0] == ec.TIMEOUT.value: - if not timeout_flag: - timeout_flag = True + success_list, timeout_flag = get_multi_join_success_puids(reply_msg.puid_status) if success_list: - if join_all: # 'all' option + if join_all: # 'all' option # we want all the processes finished, otherwise return None if len(success_list) == len(identifiers): - return success_list, reply_msg.puid_status # also return the dict with the status of all processes - # in the list for future use outside the - # Connection.wait() context - else: # there is at least one process exited and at least one errored/timed out + return success_list, reply_msg.puid_status # also return the dict with the status of all processes + # in the list for future use outside the + # Connection.wait() context + else: # there is at least one process exited and at least one errored/timed out return None, reply_msg.puid_status - else: # 'any' option and at least one process exited + else: # 'any' option and at least one process exited return success_list, reply_msg.puid_status - elif timeout_flag: # none has exited and all timed out + elif timeout_flag: # none has exited and all timed out return None, reply_msg.puid_status else: log.debug(f'process join {req_msg} failed: {reply_msg.puid_status.items()}') diff --git a/src/dragon/globalservices/process_int.py b/src/dragon/globalservices/process_int.py index 9167a44..9278079 100644 --- a/src/dragon/globalservices/process_int.py +++ b/src/dragon/globalservices/process_int.py @@ -34,6 +34,7 @@ def __init__(self, *, server, request, reply_channel, p_uid, node): self.server = server self.request = request self.reply_channel = reply_channel + self.node = node self.exit_msg = None self.destroy_request = None self.gs_ret_channel_context = None @@ -77,6 +78,16 @@ def _mk_sh_proc_create(self, the_tag, which_node): else: capacity = self.request.pipesize + if self.request.options.make_inf_channels: + req_msg = dmsg.GSChannelCreate(tag=self.server.tag_inc(), p_uid=0, r_c_uid=0, + m_uid=dfacts.infrastructure_pool_muid_from_index(which_node)) + _, _, self.gs_ret_channel_context = channel_int.ChannelContext.construct(self.server, req_msg, fake_reply_channel, + node_override=self.process_parms.index, send_msg=False) + self.gs_ret_channel_context.incref() + gs_ret_chan_msg = self.gs_ret_channel_context.shchannelcreate_msg + else: + gs_ret_chan_msg = None + if self.request.stdin == dmsg.PIPE: puid = self.process_parms.my_puid @@ -138,7 +149,8 @@ def _mk_sh_proc_create(self, the_tag, which_node): stdout_msg=stdout_msg, stderr_msg=stderr_msg, pmi_info=self.request._pmi_info, - layout=self.request.layout) #pylint: disable=protected-access + layout=self.request.layout, + gs_ret_chan_msg=gs_ret_chan_msg) #pylint: disable=protected-access def mk_sh_proc_kill(self, 
the_tag, the_sig=signal.SIGKILL): return dmsg.SHProcessKill(tag=the_tag, @@ -210,62 +222,37 @@ def construct(cls, server, msg, reply_channel, head, send_msg=True, belongs_to_g outbound_tag = None - # start making the infrastructure channels if they are needed, - # otherwise just start the process directly. - if msg.options.make_inf_channels: - LOG.debug(f'making inf channels for {which_node}') - fake_reply_channel = dutil.AbsorbingChannel() - gsr_msg = dmsg.GSChannelCreate(tag=server.tag_inc(), p_uid=0, r_c_uid=0, - m_uid=dfacts.infrastructure_pool_muid_from_index(which_node)) - - issued, tag, chan_context = channel_int.ChannelContext.construct(server, gsr_msg, - fake_reply_channel) - - context.gs_ret_channel_context = chan_context - - if not issued: - LOG.info(f'failed creating gs ret channel for puid {this_puid}') - context.descriptor.state = process_desc.ProcessDescriptor.State.DEAD - err_msg = 'gs channel create fail' - if send_msg: - rm = dmsg.GSProcessCreateResponse(tag=server.tag_inc(), - ref=msg.tag, - err=dmsg.GSProcessCreateResponse.Errors.FAIL, - err_info=err_msg) - context.reply_channel.send(rm.serialize()) - return False, err_msg, context - else: - server.pending[tag] = context.check_channel_const - return False, None, context - else: - context.gs_ret_channel_context = None - outbound_tag = context.send_start() + context.gs_ret_channel_context = None + outbound_tag, context.shprocesscreate_msg = context.send_start(send_msg) - # if it does not belong to a group issue a pending completion now - if not belongs_to_group: - server.pending[outbound_tag] = context.complete_construction + # if it does not belong to a group issue a pending completion now + if not belongs_to_group: + server.pending[outbound_tag] = context.complete_construction - return True, outbound_tag, context + return True, outbound_tag, context - def send_start(self): - # send request to shep, remember pending process. - shep_hdl = self.server.shep_inputs[self.descriptor.node] + def send_start(self, send_msg): outbound_tag = self.server.tag_inc() shep_req = self._mk_sh_proc_create(outbound_tag, self._descriptor.node) - # In cases where we are sending a large amount of - # messages, such as with the GSGroupCreate handler, - # we can fill the GS Input Queue with responses and - # basically cause the GS / TA / LS to be unable to - # send/receive any messages. To prevent this, we'll - # enqueue pending sends and interleave sending and - # receiving messages to allow us to process responses - # on the input queue. + if send_msg: + # In cases where we are sending a large amount of + # messages, such as with the GSGroupCreate handler, + # we can fill the GS Input Queue with responses and + # basically cause the GS / TA / LS to be unable to + # send/receive any messages. To prevent this, we'll + # enqueue pending sends and interleave sending and + # receiving messages to allow us to process responses + # on the input queue. + + # send request to shep, remember pending process. 
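The pending-sends approach described in the comment above is a simple back-pressure pattern: rather than pushing a burst of SHProcessCreate requests to local services synchronously (and risking the GS input queue filling with responses that cannot be drained), requests are queued and sent interleaved with receives. The Global Services main loop that drains `pending_sends` is not shown in this diff; the sketch below only illustrates the interleaving idea, and `FakeHandle`, `recv_one`, and `max_sends_per_iter` are illustrative names.

```python
# Illustrative only: the real Global Services server loop is not part of this diff.
import queue

pending_sends = queue.Queue()          # (handle, serialized_message) pairs

def send_start(shep_handle, serialized_req):
    # Enqueue instead of sending synchronously so a burst of creates cannot
    # flood the shepherd before responses can be read back.
    pending_sends.put((shep_handle, serialized_req))

def main_loop(recv_one, max_sends_per_iter=8):
    while True:
        msg = recv_one()               # handle one inbound message (None means idle)
        if msg is None and pending_sends.empty():
            break
        # Drain a bounded number of queued sends, interleaved with receives.
        for _ in range(max_sends_per_iter):
            try:
                handle, payload = pending_sends.get_nowait()
            except queue.Empty:
                break
            handle.send(payload)

class FakeHandle:
    def send(self, payload):
        print("sent", payload)

send_start(FakeHandle(), b"SHProcessCreate-1")
send_start(FakeHandle(), b"SHProcessCreate-2")
inbox = iter([b"SHProcessCreateResponse-1", None])
main_loop(lambda: next(inbox))
```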
+ shep_hdl = self.server.shep_inputs[self.descriptor.node] + + self.server.pending_sends.put((shep_hdl, shep_req.serialize())) + LOG.debug(f'request {shep_req} to shep') - self.server.pending_sends.put((shep_hdl, shep_req.serialize())) - LOG.debug(f'request {shep_req} to shep') - return outbound_tag + return outbound_tag, shep_req def check_channel_const(self, msg): """This is the channel construction message - see if that worked and if so @@ -347,6 +334,21 @@ def complete_construction(self, msg, send_msg=True): if dmsg.SHProcessCreateResponse.Errors.SUCCESS == msg.err: self.descriptor.state = process_desc.ProcessDescriptor.State.ACTIVE + if self.gs_ret_channel_context is not None: + channel_constructed = self.gs_ret_channel_context.complete_construction(msg.gs_ret_chan_resp) + + if not channel_constructed: + err_msg = f'Failed to create GS return channel for {self.descriptor.p_uid}.' + LOG.info(err_msg) + self.descriptor.state = process_desc.ProcessDescriptor.State.DEAD + if send_msg: + rm = dmsg.GSProcessCreateResponse(tag=self.server.tag_inc(), + ref=self.request.tag, + err=dmsg.GSProcessCreateResponse.Errors.FAIL, + err_info=err_msg) + self.reply_channel.send(rm.serialize()) + return False + if self.stdin_context is not None: channel_constructed = self.stdin_context.complete_construction(msg.stdin_resp) diff --git a/src/dragon/globalservices/server.py b/src/dragon/globalservices/server.py index d5506bf..5a522a2 100644 --- a/src/dragon/globalservices/server.py +++ b/src/dragon/globalservices/server.py @@ -294,7 +294,7 @@ def choose_shepherd(self, msg:dmsg.GSProcessCreate): assert msg.layout.h_uid in self.node_table.keys() index = self.node_table[msg.layout.h_uid].ls_index - log.info(f"choose shepherd {index} for {msg}") + log.debug("choose shepherd %d for %s", index, msg) return index @@ -450,7 +450,8 @@ def do_timeouts(self): gspjlr = dmsg.GSProcessJoinListResponse timed_out = self.pending_join_list.get_timed_out() for join_request in timed_out: - log.debug(f'multi-join timed out: {join_request}') + # TODO AICI-1422 Implement verbose logging options + # log.debug('multi-join timed out: %s', join_request) p_uid_list, req_msg = join_request # p_uid_list is a frozenset reply_channel = self.get_reply_handle(req_msg) for p_uid in p_uid_list: @@ -458,7 +459,6 @@ def do_timeouts(self): rm = gspjlr(tag=self.tag_inc(), ref=req_msg.tag, puid_status=self.puid_join_list_status[(req_msg.tag, req_msg.p_uid)]) reply_channel.send(rm.serialize()) self.pending_join_list.remove_one(p_uid_list, req_msg) - log.debug(f'timed out multi-join response to {req_msg!s}: {rm!s}') del self.puid_join_list_status[(req_msg.tag, req_msg.p_uid)] # we are done with this msg req gscjr = dmsg.GSChannelJoinResponse @@ -870,6 +870,7 @@ def handle_process_join_list(self, msg): # helper function which updates the puid_join_list_status dict with the status of each puid def proc_join_list_helper(target_uid, found, errmsg, success_num, item): + nonzero_exit = False if not found: self.puid_join_list_status[join_key][item] = (gspjlr.Errors.UNKNOWN.value, errmsg) log.debug(f'unknown target_uid: {item}') @@ -881,6 +882,9 @@ def proc_join_list_helper(target_uid, found, errmsg, success_num, item): pdesc = pctx.descriptor if process_desc.ProcessDescriptor.State.DEAD == pdesc.state: self.puid_join_list_status[join_key][target_uid] = (gspjlr.Errors.SUCCESS.value, pdesc.ecode) + if pdesc.ecode != 0: + nonzero_exit = True + log.debug(f'non-zero exit {pdesc.ecode} for target_uid: {target_uid}') success_num += 1 log.debug(f'process dead, 
target_uid: {target_uid}') else: @@ -889,26 +893,40 @@ def proc_join_list_helper(target_uid, found, errmsg, success_num, item): else: pending_puid.append(target_uid) self.puid_join_list_status[join_key][target_uid] = (gspjlr.Errors.PENDING.value, None) - return success_num + return success_num, nonzero_exit + + def continue_to_wait(join_all, success_num, pending_puid, return_on_bad_exit, nonzero_exit): + + # Always send a message if we are told to return on bad exit and have a non-zero exit code + if return_on_bad_exit and nonzero_exit: + return False + # Otherwise use the original logic + else: + due_to_join_all = bool(join_all and pending_puid) + due_to_partial_complete = bool(not success_num and pending_puid) + log.debug(f'due_to_join_all -> {due_to_join_all} | due_to_partial_complete -> {due_to_partial_complete}') + return due_to_partial_complete or due_to_join_all if msg.t_p_uid_list: for item in msg.t_p_uid_list: item = int(item) target_uid, found, errmsg = self.resolve_puid(None, item) - success_num = proc_join_list_helper(target_uid, found, errmsg, success_num, item) + success_num, nonzero_exit = proc_join_list_helper(target_uid, found, errmsg, success_num, item) if msg.user_name_list: for item in msg.user_name_list: target_uid, found, errmsg = self.resolve_puid(item, None) - success_num = proc_join_list_helper(target_uid, found, errmsg, success_num, item) + success_num, nonzero_exit = proc_join_list_helper(target_uid, found, errmsg, success_num, item) # Do not send a message if all the processes are still pending # if no exits no timeouts, or, 'all' option and some are pending (even though some may have exited) - if (not success_num and pending_puid) or (msg.join_all and pending_puid): + if continue_to_wait(msg.join_all, success_num, pending_puid, msg.return_on_bad_exit, nonzero_exit): self.pending_join_list.put(frozenset(pending_puid), msg, timeout=timeout) - log.debug(f'join stored for target_uid: {pending_puid} timeout {timeout}') + # TODO AICI-1422 Implement verbose logging options + # log.debug('join stored for target_uid: %s timeout %s', pending_puid, timeout) else: rm = gspjlr(tag=self.tag_inc(), ref=msg.tag, puid_status=self.puid_join_list_status[join_key]) reply_channel.send(rm.serialize()) + log.debug('send join list response') del self.puid_join_list_status[join_key] @dutil.route(dmsg.SHProcessExit, DTBL) @@ -959,7 +977,8 @@ def handle_process_exit(self, msg): err=gspjr.Errors.SUCCESS, exit_code=ctx.descriptor.ecode) reply_channel.send(rm.serialize()) - log.debug(f'join response to {join_req}: {rm}') + # TODO AICI-1422 Implement verbose logging options + # log.debug('join response to %s: %s', join_req, rm) self.pending_join.remove(msg.p_uid) to_be_erased = [] @@ -971,13 +990,15 @@ def handle_process_exit(self, msg): join_key = (join_req.tag, join_req.p_uid) self.puid_join_list_status[join_key][msg.p_uid] = (gspjlr.Errors.SUCCESS.value, ctx.descriptor.ecode) - # if this is multi_join on 'any' processes request, or, set with a single member - if not join_req.join_all or len(puid_set) == 1: + # if this is multi_join on 'any' processes request, or, set with a single member or there was a bad exit + # and a request to respond on it + if (not join_req.join_all or len(puid_set) == 1) or (join_req.return_on_bad_exit and ctx.descriptor.ecode != 0): reply_channel = self.get_reply_handle(join_req) rm = gspjlr(tag=self.tag_inc(), ref=join_req.tag, puid_status=self.puid_join_list_status[join_key]) reply_channel.send(rm.serialize()) - log.debug(f'join response to {join_req}: {rm}') + 
# TODO AICI-1422 Implement verbose logging options + # log.debug('join response to %s: %s', join_req, rm) # we are done with this join request, so let's clean things up del self.puid_join_list_status[join_key] @@ -1167,10 +1188,12 @@ def handle_channel_list(self, msg): @dutil.route(dmsg.GSChannelQuery, DTBL) def handle_channel_query(self, msg): log = self._channel_logger - # log.debug(f'handling {msg}') + # TODO AICI-1422 Implement verbose logging options + # log.debug('handling %s', msg) reply_channel = self.get_reply_handle(msg) - # log.debug(f'channel query to {msg.user_name} - {msg.c_uid}') + # TODO AICI-1422 Implement verbose logging options + # log.debug('channel query to %s - %s', msg.user_name, msg.c_uid) target_uid, found, errmsg = self.resolve_cuid(msg.user_name, msg.c_uid) if not found: @@ -1189,10 +1212,12 @@ def handle_channel_query(self, msg): rm = dmsg.GSChannelQueryResponse(tag=self.tag_inc(), ref=msg.tag, err=dmsg.GSChannelQueryResponse.Errors.SUCCESS, desc=cdesc) - # log.debug(f'found descriptor: {cdesc}') + # TODO AICI-1422 Implement verbose logging options + # log.debug('found descriptor: %s', cdesc) reply_channel.send(rm.serialize()) - # log.debug(f'response to {msg!s}: {rm!s}') + # TODO AICI-1422 Implement verbose logging options + # log.debug('response to %s: %s', msg, rm) @dutil.route(dmsg.GSChannelJoin, DTBL) @@ -1240,7 +1265,7 @@ def handle_channel_join(self, msg): log.debug(f'channel join stored for {target_uid}: {msg} timeout {the_timeout}') - @dutil.route(dmsg.GSDump, DTBL) + @dutil.route(dmsg.GSDumpState, DTBL) def handle_dump(self, msg): log = self._process_logger log.debug(f'dumping to {msg.filename}') @@ -1315,6 +1340,16 @@ def handle_node_query(self, msg): log.debug(f'response to {msg!s}: {rm!s}') + @dutil.route(dmsg.SHMultiProcessCreateResponse, DTBL) + def handle_sh_group_create_response(self, msg): + log = self._node_logger + log.debug('handling %s', msg) + + for proc_create_response in msg.responses: + log.info("proc_create_response=%s", str(proc_create_response)) + self.handle_process_create_response(proc_create_response) + + @dutil.route(dmsg.GSGroupCreate, DTBL) def handle_group_create(self, msg): log = self._group_logger diff --git a/src/dragon/infrastructure/connection.py b/src/dragon/infrastructure/connection.py index 3a97fd9..2604861 100644 --- a/src/dragon/infrastructure/connection.py +++ b/src/dragon/infrastructure/connection.py @@ -328,7 +328,14 @@ def recv(self): self.read_adapter.advance_raw_header(msg_len) buf = bytearray(msg_len) self.read_adapter.readinto(memoryview(buf)) - obj = pickle.loads(buf) + # The following try-except is here so we can receive unpickled raw data through + # this recv and it is there only as long as we have infrastructure processes + # sending unpickled data through channels while it is being received here. This + # presently occurs when sending messages from C/C++ to infrastructure components. + try: + obj = pickle.loads(buf) + except pickle.UnpicklingError: + return buf else: obj = pickle.load(self.read_adapter) except EOFError: diff --git a/src/dragon/infrastructure/facts.py b/src/dragon/infrastructure/facts.py index f022181..2d88aaa 100644 --- a/src/dragon/infrastructure/facts.py +++ b/src/dragon/infrastructure/facts.py @@ -3,6 +3,7 @@ import os import shlex import enum +import socket import sys from .. 
import dtypes @@ -61,6 +62,13 @@ def env_name(parameter_name: str) -> str: NATIVE = 'NATIVE' DEFAULT_POOL = NATIVE +# This number is set so that cuids can be divided between local +# services instances for the purpose of doling out process local +# channels. This allows a little more than 16 million nodes in the +# dragon run-time instance. +MAX_NODES_POW = 24 +MAX_NODES = 2 ** MAX_NODES_POW + #: Input channel unique ID for Global Services head GS_INPUT_CUID = 2 @@ -101,10 +109,12 @@ def env_name(parameter_name: str) -> str: #: Range for the local transport agent channel ID's (cuid) RANGE_TA_CUID = 2 ** 55 -#: Starting value of the local services shepherd channel unique ID (cuid) -BASE_SHEP_CUID = 2 ** 56 -#: Range for the local services channel ID's (cuid) -RANGE_SHEP_CUID = 2 ** 56 +#: Starting value for the backend channel ID (cuid) that +#: communicates with the frontend +BASE_BE_FE_CUID = 2 ** 56 +#: Range for the backend channel ID's (cuid) that +#: communicate with the frontend +RANGE_BE_FE_CUID = 2 ** 56 #: Starting value for the local launcher backend channel unique ID (cuid) BASE_BE_CUID = 2 ** 57 @@ -135,12 +145,11 @@ def env_name(parameter_name: str) -> str: #: communicate with its transport agent RANGE_BE_LOCAL_CUID = 2 ** 61 -#: Starting value for the backend channel ID (cuid) that -#: communicates with the frontend -BASE_BE_FE_CUID = 2 ** 62 -#: Range for the backend channel ID's (cuid) that -#: communicate with the frontend -RANGE_BE_FE_CUID = 2 ** 62 +#: Starting value of the local services shepherd channel unique ID (cuid) +SHEP_CUID_POW = 62 +BASE_SHEP_CUID = 2 ** SHEP_CUID_POW +#: Range for the local services channel ID's (cuid) +RANGE_SHEP_CUID = 2 ** SHEP_CUID_POW #: Starting value of the User created channel ID (cuid) FIRST_CUID = 2 ** 63 @@ -555,6 +564,8 @@ def index_from_default_pool_muid(m_uid:int) -> int: PROCNAME_LS, PROCNAME_TCP_TA, PROCNAME_OVERLAY_TA, + PROCNAME_OOB_TA, + PROCNAME_RDMA_TA, console_script_args) # Aliases for transport agent commands. 
Transport agent commands are always @@ -619,8 +630,32 @@ def from_str(s): DEFAULT_OVERLAY_NETWORK_PORT = 6565 DEFAULT_FRONTEND_PORT = 6566 DEFAULT_PMI_CONTROL_PORT = 8575 +DEFAULT_OOB_PORT = 9575 DEFAULT_PORT_RANGE=1000 +def port_check(ip_port): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.bind(ip_port) + except Exception: + if s is not None: + s.close() + return False + else: + s.close() + return True + +def get_port(min_port, port_range): + host = socket.gethostname() + max_port = min_port + port_range + + for port in range(min_port, max_port): + if port_check((host, port)): + return port + +# Port used for out-of-band communication +OOB_PORT = get_port(DEFAULT_OOB_PORT, DEFAULT_PORT_RANGE) + # GS_DEFAULT_POLICY -- To prevent circular imports, this lives in policy_eval.py -DEFAULT_NET_CONF_CACHE = os.path.join(os.getcwd(), ".dragon-net-conf") +DEFAULT_NET_CONF_CACHE = os.path.join(os.getcwd(), ".dragon-net-conf") \ No newline at end of file diff --git a/src/dragon/infrastructure/gpu_desc.py b/src/dragon/infrastructure/gpu_desc.py index 90809cb..875b895 100644 --- a/src/dragon/infrastructure/gpu_desc.py +++ b/src/dragon/infrastructure/gpu_desc.py @@ -55,7 +55,7 @@ def find_accelerators() -> AcceleratorDescriptor: devices = find_nvidia() if devices is not None: acc = AcceleratorDescriptor(vendor=AccVendor.NVIDIA, - device_list=list(range(len(devices)-1)), + device_list=list(range(len(devices))), env_str=AccEnvStr.NVIDIA ) return acc @@ -85,7 +85,7 @@ def find_accelerators() -> AcceleratorDescriptor: n_devices += 1 if n_devices > 0: - devices.device_list = list(range(n_devices-1)) + devices.device_list = list(range(n_devices)) else: return None diff --git a/src/dragon/infrastructure/group_desc.py b/src/dragon/infrastructure/group_desc.py index 72eecad..d674ac3 100644 --- a/src/dragon/infrastructure/group_desc.py +++ b/src/dragon/infrastructure/group_desc.py @@ -32,12 +32,12 @@ class Errors(enum.Enum): FAIL = 1 #: Resource was not created ALREADY = 2 #: Resource exists already - state : object = None - uid : int = None - placement : int = None - desc : object = None - error_code : int = None - error_info : str = None + state: object = None + uid: int = None + placement: int = None + desc: object = None + error_code: int = None + error_info: str = None @classmethod def from_sdict(cls, d): @@ -79,16 +79,21 @@ def get_sdict(self): return rv - state : State = State.PENDING - g_uid : int = None - name : str = None - sets : list(list()) = field(default_factory=list) - policy : Policy = None + state: State = State.PENDING + g_uid: int = None + name: str = None + sets: list(list()) = field(default_factory=list) + policy: Policy = None def __post_init__(self): if type(self.policy) is dict: self.policy = Policy.from_sdict(self.policy) + if type(self.policy) is list: + modded_args = [(i, Policy.from_sdict(p)) for i, p in enumerate(self.policy) if isinstance(p, dict)] + for i, p in modded_args: + self.policy[i] = p + if self.sets: old_sets, self.sets = self.sets, [] for i, lst in enumerate(old_sets): @@ -136,6 +141,8 @@ def get_sdict(self): if isinstance(self.policy, Policy): rv['policy'] = self.policy.get_sdict() + elif isinstance(self.policy, list): + rv['policy'] = [policy.get_sdict() for policy in self.policy] else: rv['policy'] = self.policy diff --git a/src/dragon/infrastructure/messages.py b/src/dragon/infrastructure/messages.py index 820eab0..a8670c4 100644 --- a/src/dragon/infrastructure/messages.py +++ b/src/dragon/infrastructure/messages.py @@ -1,14 +1,19 @@ """Dragon 
infrastructure messages are the internal API used for service communication. """ +import sys import enum import json import zlib -import base64 import subprocess -from typing import Optional, Union +import traceback +from typing import Dict, List, Optional, Union from dataclasses import dataclass, asdict +import ctypes +from contextlib import contextmanager,redirect_stderr,redirect_stdout +from os import devnull + from ..infrastructure import channel_desc from ..infrastructure import pool_desc from ..infrastructure.node_desc import NodeDescriptor @@ -16,311 +21,1512 @@ from ..infrastructure import parameters as parms from ..infrastructure import facts as dfacts from ..localservices import options as dso -from ..infrastructure.util import to_str_iter -from ..utils import B64 +from ..infrastructure.util import get_external_ip_addr, to_str_iter +from ..utils import B64, b64encode, b64decode +from ..rc import DragonError +import capnp +from ..infrastructure import message_defs_capnp as capnp_schema + +from ..globalservices.policy_eval import ResourceLayout, Policy + +from ..infrastructure import group_desc +# from ..infrastructure.policy import DefaultPolicy + + + +INT_NONE = 0 - 0X80000000 + +# This enum class lists the type codes in infrastructure +# messages. The values are significant for interoperability. + +@enum.unique +class MessageTypes(enum.Enum): + """ + These are the enumerated values of message type identifiers within + the Dragon infrastructure messages. + """ + DRAGON_MSG = 0 #: Deliberately invalid + GS_PROCESS_CREATE = enum.auto() #: + GS_PROCESS_CREATE_RESPONSE = enum.auto() #: + GS_PROCESS_LIST = enum.auto() #: + GS_PROCESS_LIST_RESPONSE = enum.auto() #: + GS_PROCESS_QUERY = enum.auto() #: + GS_PROCESS_QUERY_RESPONSE = enum.auto() #: + GS_PROCESS_KILL = enum.auto() #: + GS_PROCESS_KILL_RESPONSE = enum.auto() #: + GS_PROCESS_JOIN = enum.auto() #: + GS_PROCESS_JOIN_RESPONSE = enum.auto() #: + GS_CHANNEL_CREATE = enum.auto() #: + GS_CHANNEL_CREATE_RESPONSE = enum.auto() #: + GS_CHANNEL_LIST = enum.auto() #: + GS_CHANNEL_LIST_RESPONSE = enum.auto() #: + GS_CHANNEL_QUERY = enum.auto() #: + GS_CHANNEL_QUERY_RESPONSE = enum.auto() #: + GS_CHANNEL_DESTROY = enum.auto() #: + GS_CHANNEL_DESTROY_RESPONSE = enum.auto() #: + GS_CHANNEL_JOIN = enum.auto() #: + GS_CHANNEL_JOIN_RESPONSE = enum.auto() #: + GS_CHANNEL_DETACH = enum.auto() #: + GS_CHANNEL_DETACH_RESPONSE = enum.auto() #: + GS_CHANNEL_GET_SENDH = enum.auto() #: + GS_CHANNEL_GET_SENDH_RESPONSE = enum.auto() #: + GS_CHANNEL_GET_RECVH = enum.auto() #: + GS_CHANNEL_GET_RECVH_RESPONSE = enum.auto() #: + ABNORMAL_TERMINATION = enum.auto() #: + GS_STARTED = enum.auto() #: + GS_PING_SH = enum.auto() #: + GS_IS_UP = enum.auto() #: + GS_HEAD_EXIT = enum.auto() #: + GS_CHANNEL_RELEASE = enum.auto() #: + GS_HALTED = enum.auto() #: + SH_PROCESS_CREATE = enum.auto() #: + SH_PROCESS_CREATE_RESPONSE = enum.auto() #: + SH_MULTI_PROCESS_CREATE = enum.auto() #: + SH_MULTI_PROCESS_CREATE_RESPONSE = enum.auto() #: + SH_PROCESS_KILL = enum.auto() #: + SH_PROCESS_EXIT = enum.auto() #: + SH_CHANNEL_CREATE = enum.auto() #: + SH_CHANNEL_CREATE_RESPONSE = enum.auto() #: + SH_CHANNEL_DESTROY = enum.auto() #: + SH_CHANNEL_DESTROY_RESPONSE = enum.auto() #: + SH_LOCK_CHANNEL = enum.auto() #: + SH_LOCK_CHANNEL_RESPONSE = enum.auto() #: + SH_ALLOC_MSG = enum.auto() #: + SH_ALLOC_MSG_RESPONSE = enum.auto() #: + SH_ALLOC_BLOCK = enum.auto() #: + SH_ALLOC_BLOCK_RESPONSE = enum.auto() #: + SH_CHANNELS_UP = enum.auto() #: + SH_PING_GS = enum.auto() #: + SH_HALTED = 
enum.auto() #: + SH_FWD_INPUT = enum.auto() #: + SH_FWD_INPUT_ERR = enum.auto() #: + SH_FWD_OUTPUT = enum.auto() #: + GS_TEARDOWN = enum.auto() #: + SH_TEARDOWN = enum.auto() #: + SH_PING_BE = enum.auto() #: + BE_PING_SH = enum.auto() #: + TA_PING_SH = enum.auto() #: + SH_HALT_TA = enum.auto() #: + TA_HALTED = enum.auto() #: + SH_HALT_BE = enum.auto() #: + BE_HALTED = enum.auto() #: + TA_UP = enum.auto() #: + GS_PING_PROC = enum.auto() #: + GS_DUMP_STATE = enum.auto() #: + SH_DUMP_STATE = enum.auto() #: + LA_BROADCAST = enum.auto() #: + LA_PASS_THRU_FB = enum.auto() #: + LA_PASS_THRU_BF = enum.auto() #: + GS_POOL_CREATE = enum.auto() #: + GS_POOL_CREATE_RESPONSE = enum.auto() #: + GS_POOL_DESTROY = enum.auto() #: + GS_POOL_DESTROY_RESPONSE = enum.auto() #: + GS_POOL_LIST = enum.auto() #: + GS_POOL_LIST_RESPONSE = enum.auto() #: + GS_POOL_QUERY = enum.auto() #: + GS_POOL_QUERY_RESPONSE = enum.auto() #: + SH_POOL_CREATE = enum.auto() #: + SH_POOL_CREATE_RESPONSE = enum.auto() #: + SH_POOL_DESTROY = enum.auto() #: + SH_POOL_DESTROY_RESPONSE = enum.auto() #: + SH_CREATE_PROCESS_LOCAL_CHANNEL = enum.auto() #: + SH_CREATE_PROCESS_LOCAL_CHANNEL_RESPONSE = enum.auto() #: + SH_PUSH_KVL = enum.auto() #: + SH_PUSH_KVL_RESPONSE = enum.auto() #: + SH_POP_KVL = enum.auto() #: + SH_POP_KVL_RESPONSE = enum.auto() #: + SH_GET_KVL = enum.auto() #: + SH_GET_KVL_RESPONSE = enum.auto() #: + SH_SET_KV = enum.auto() #: + SH_SET_KV_RESPONSE = enum.auto() + SH_GET_KV = enum.auto() + SH_GET_KV_RESPONSE = enum.auto() + SH_EXEC_MEM_REQUEST = enum.auto() #: + SH_EXEC_MEM_RESPONSE = enum.auto() #: + GS_UNEXPECTED = enum.auto() #: + LA_SERVER_MODE = enum.auto() #: + LA_SERVER_MODE_EXIT = enum.auto() #: + LA_PROCESS_DICT = enum.auto() #: + LA_PROCESS_DICT_RESPONSE = enum.auto() #: + LA_DUMP_STATE = enum.auto() #: + BE_NODE_IDX_SH = enum.auto() #: + LA_CHANNELS_INFO = enum.auto() #: + SH_PROCESS_KILL_RESPONSE = enum.auto() #: + BREAKPOINT = enum.auto() #: + GS_PROCESS_JOIN_LIST = enum.auto() #: + GS_PROCESS_JOIN_LIST_RESPONSE = enum.auto() #: + GS_NODE_QUERY = enum.auto() #: + GS_NODE_QUERY_RESPONSE = enum.auto() #: + LOGGING_MSG = enum.auto() #: + LOGGING_MSG_LIST = enum.auto() #: + LOG_FLUSHED = enum.auto() #: + GS_NODE_LIST = enum.auto() #: + GS_NODE_LIST_RESPONSE = enum.auto() #: + GS_NODE_QUERY_TOTAL_CPU_COUNT = enum.auto() #: + GS_NODE_QUERY_TOTAL_CPU_COUNT_RESPONSE = enum.auto() #: + BE_IS_UP = enum.auto() #: + FE_NODE_IDX_BE = enum.auto() #: + HALT_OVERLAY = enum.auto() #: + HALT_LOGGING_INFRA = enum.auto() #: + OVERLAY_PING_BE = enum.auto() #: + OVERLAY_PING_LA = enum.auto() #: + LA_HALT_OVERLAY = enum.auto() #: + BE_HALT_OVERLAY = enum.auto() #: + OVERLAY_HALTED = enum.auto() #: + EXCEPTIONLESS_ABORT = enum.auto() #: Communicate abnormal termination without raising exception + LA_EXIT = enum.auto() #: + GS_GROUP_LIST = enum.auto() #: + GS_GROUP_LIST_RESPONSE = enum.auto() #: + GS_GROUP_QUERY = enum.auto() #: + GS_GROUP_QUERY_RESPONSE = enum.auto() #: + GS_GROUP_DESTROY = enum.auto() #: + GS_GROUP_DESTROY_RESPONSE = enum.auto() #: + GS_GROUP_ADD_TO = enum.auto() #: + GS_GROUP_ADD_TO_RESPONSE = enum.auto() #: + GS_GROUP_REMOVE_FROM = enum.auto() #: + GS_GROUP_REMOVE_FROM_RESPONSE = enum.auto() #: + GS_GROUP_CREATE = enum.auto() #: + GS_GROUP_CREATE_RESPONSE = enum.auto() #: + GS_GROUP_KILL = enum.auto() #: + GS_GROUP_KILL_RESPONSE = enum.auto() #: + GS_GROUP_CREATE_ADD_TO = enum.auto() #: + GS_GROUP_CREATE_ADD_TO_RESPONSE = enum.auto() #: + GS_GROUP_DESTROY_REMOVE_FROM = enum.auto() #: + 
GS_GROUP_DESTROY_REMOVE_FROM_RESPONSE = enum.auto() #: + TA_UPDATE_NODES = enum.auto() #: + RUNTIME_DESC = enum.auto() #: + USER_HALT_OOB = enum.auto() #: + DD_REGISTER_CLIENT = enum.auto() #: + DD_REGISTER_CLIENT_RESPONSE = enum.auto() #: + DD_DESTROY = enum.auto() #: + DD_DESTROY_RESPONSE = enum.auto() #: + DD_REGISTER_MANAGER = enum.auto() #: + DD_REGISTER_MANAGER_RESPONSE = enum.auto() #: + DD_REGISTER_CLIENT_ID = enum.auto() #: + DD_REGISTER_CLIENT_ID_RESPONSE = enum.auto() #: + DD_DESTROY_MANAGER = enum.auto() #: + DD_DESTROY_MANAGER_RESPONSE = enum.auto() #: + DD_PUT = enum.auto() #: + DD_PUT_RESPONSE = enum.auto() #: + DD_GET = enum.auto() #: + DD_GET_RESPONSE = enum.auto() #: + DD_POP = enum.auto() #: + DD_POP_RESPONSE = enum.auto() #: + DD_CONTAINS = enum.auto() #: + DD_CONTAINS_RESPONSE = enum.auto() #: + DD_GET_LENGTH = enum.auto() #: + DD_GET_LENGTH_RESPONSE = enum.auto() #: + DD_CLEAR = enum.auto() #: + DD_CLEAR_RESPONSE = enum.auto() #: + DD_GET_ITERATOR = enum.auto() #: + DD_GET_ITERATOR_RESPONSE = enum.auto() #: + DD_ITERATOR_NEXT = enum.auto() #: + DD_ITERATOR_NEXT_RESPONSE = enum.auto() #: + DD_KEYS = enum.auto() #: + DD_KEYS_RESPONSE = enum.auto() #: + DD_DEREGISTER_CLIENT = enum.auto() #: + DD_DEREGISTER_CLIENT_RESPONSE = enum.auto() #: + DD_CREATE = enum.auto() #: + DD_CREATE_RESPONSE = enum.auto() #: + DD_CONNECT_TO_MANAGER = enum.auto() #: + DD_CONNECT_TO_MANAGER_RESPONSE = enum.auto() #: + DD_GET_RANDOM_MANAGER = enum.auto() #: + DD_GET_RANDOM_MANAGER_RESPONSE = enum.auto() #: + + +@enum.unique +class FileDescriptor(enum.Enum): + stdin = 0 + stdout = 1 + stderr = 2 + + +PIPE = subprocess.PIPE +STDOUT = subprocess.STDOUT +DEVNULL = subprocess.DEVNULL + + +class AbnormalTerminationError(Exception): + + def __init__(self, msg=''): + self._msg = msg + + def __str__(self): + return f'{self._msg}' + + def __repr__(self): + return f"{str(__class__)}({repr(self._msg)})" + + +@dataclass +class PMIGroupInfo(): + """ + Required information to enable the launching of pmi based applications. + """ + + job_id: int + nnodes: int + nranks: int + nidlist: list[int] + hostlist: list[str] + control_port: int + + @classmethod + def fromdict(cls, d): + try: + return cls(**d) + except Exception as exc: + raise ValueError(f'Error deserializing {cls.__name__} {d=}') from exc + + +@dataclass +class PMIProcessInfo(): + """ + Required information to enable the launching of pmi based applications. + """ + + lrank: int + ppn: int + nid: int + pid_base: int + + @classmethod + def fromdict(cls, d): + try: + return cls(**d) + except Exception as exc: + raise ValueError(f'Error deserializing {cls.__name__} {d=}') from exc + + +class InfraMsg(object): + """Common base for all messages. + + This common base type for all messages sets up the + default fields and the serialization strategy for + now. 
+ """ + + _tc = MessageTypes.DRAGON_MSG # deliberately invalid value, overridden + + @enum.unique + class Errors(enum.Enum): + INVALID = -1 # deliberately invalid, overridden + + def __init__(self, tag, ref=None, err=None): + assert isinstance(tag, int) + + self._tag = int(tag) + + if ref is None: + self._ref = None + else: + self._ref = int(ref) + + if err is not None: + if isinstance(err, self.Errors): + self._err = err + elif isinstance(err, int): + self._err = self.Errors(err) + else: + raise NotImplementedError('invalid error parameter') + else: + self._err = err + + def get_sdict(self): + + rv = {'_tc': self._tc.value, + 'tag': self.tag} + + if self.err is not None: + rv['err'] = self.err.value + + if self.ref is not None: + assert isinstance(self.ref, int) + rv['ref'] = self.ref + + return rv + + @property + def tc(self): + return self._tc + + @classmethod + def tcv(cls): + return cls._tc.value + + @property + def tag(self): + return self._tag + + @tag.setter + def tag(self, value): + self._tag = value + + @property + def ref(self): + return self._ref + + @property + def err(self): + return self._err + + # the keys in the serialization dictionary must match up + # with the arguments in the __init__ constructor + # for all the subclasses + @classmethod + def from_sdict(cls, sdict): + return cls(**sdict) + + @classmethod + def deserialize(cls, msg): + raise ValueError('Called deserialize on InfraMsg base class which should not happen.') + + def uncompressed_serialize(self): + return json.dumps(self.get_sdict()) + + def serialize(self): + return b64encode(zlib.compress(json.dumps(self.get_sdict()).encode('utf-8'))) + + def __str__(self): + cn = self.__class__.__name__ + msg = f'{cn}: {self.tag}' + if hasattr(self, 'p_uid'): + msg += f' {self.p_uid}' + + if hasattr(self, 'r_c_uid'): + msg += f'->{self.r_c_uid}' + return msg + + def __repr__(self): + fields_to_set = self.get_sdict() + del fields_to_set['_tc'] + fs = ', '.join([f'{k!s}={v!r}' for k, v in fields_to_set.items()]) + return f'{self.__class__.__name__}({fs})' + + +class CapNProtoMsg: + """Common base for all capnproto messages. + + This common base type for all messages sets up the + default fields and the serialization strategy for + messages to be exchanged between C and Python. 
+ """ + Errors = DragonError + + _tc = MessageTypes.DRAGON_MSG # deliberately invalid value, overridden + + def __init__(self, tag): + self._tag = tag + + @classmethod + def from_sdict(cls, sdict): + return cls(**sdict) + + @classmethod + def deserialize(cls, msg_str): + msg = capnp_schema.MessageDef.from_bytes_packed(msg_str) + sdict = msg.to_dict() + flattened_dict = {} + typecode = sdict['tc'] + del sdict['tc'] + tag = sdict['tag'] + del sdict['tag'] + if 'value' in sdict['responseOption']: + flattened_dict.update(sdict['responseOption']['value']) + del sdict['responseOption'] + for msg_type in sdict: + for field in sdict[msg_type]: + flattened_dict[field] = sdict[msg_type][field] + flattened_dict['tag'] = tag + if 'none' in flattened_dict: + del flattened_dict['none'] + + return mt_dispatch[typecode].from_sdict(flattened_dict) + + def serialize(self): + cap_msg = self.builder() + return cap_msg.to_bytes_packed() + + def get_sdict(self): + rv = {'_tc': self._tc.value, + 'tag': self.tag} + return rv + + def builder(self): + cap_msg = capnp_schema.MessageDef.new_message() + cap_msg.tc = self._tc.value + cap_msg.tag = self._tag + return cap_msg + + def __repr__(self): + fields_to_set = self.get_sdict() + del fields_to_set['_tc'] + fs = ', '.join([f'{k!s}={v!r}' for k, v in fields_to_set.items()]) + return f'{self.__class__.__name__}({fs})' + + @property + def capnp_name(self): + name = self.__class__.__name__ + return name[:2].lower()+name[2:] + + @property + def tc(self): + return self._tc + + @property + def tag(self): + return self._tag + +class CapNProtoResponseMsg(CapNProtoMsg): + """Common base for all capnproto response messages. + + This provides some support for code common + to all response messages. + """ + def __init__(self, tag, ref, err, errInfo): + super().__init__(tag) + self._ref = ref + self._err = err + self._errInfo = errInfo + + def get_sdict(self): + rv = super().get_sdict() + rv['ref'] = self._ref + rv['err'] = self._err + rv['errInfo'] = self._errInfo + return rv + + def builder(self): + cap_msg = super().builder() + resp_msg = cap_msg.init('responseOption').init('value') + resp_msg.ref = self._ref + resp_msg.err = DragonError(self._err).value + resp_msg.errInfo = self._errInfo + return cap_msg + + @property + def ref(self): + return self._ref + + @property + def err(self): + return self._err + + @property + def errInfo(self): + return self._errInfo + + +class SHCreateProcessLocalChannel(CapNProtoMsg): + + _tc = MessageTypes.SH_CREATE_PROCESS_LOCAL_CHANNEL + + def __init__(self, tag, puid, respFLI): + super().__init__(tag) + self._puid = puid + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['puid'] = self._puid + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.puid = self._puid + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def respFLI(self): + return self._respFLI + + @property + def puid(self): + return self._puid + +class SHCreateProcessLocalChannelResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.SH_CREATE_PROCESS_LOCAL_CHANNEL_RESPONSE + + def __init__(self, tag, ref, err, errInfo='', serChannel=''): + super().__init__(tag, ref, err, errInfo) + self._serChannel = serChannel + + def get_sdict(self): + rv = super().get_sdict() + rv['serChannel'] = self._serChannel + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.serChannel = 
self._serChannel + return cap_msg + + @property + def serialized_channel(self): + return self._serChannel + +class SHPushKVL(CapNProtoMsg): + _tc = MessageTypes.SH_PUSH_KVL + + def __init__(self, tag, key, value, respFLI): + super().__init__(tag) + self._key = key + self._value = value + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['key'] = self._key + rv['value'] = self._value + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.key = self._key + client_msg.value = self._value + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def key(self): + return self._key + + @property + def value(self): + return self._value + + @property + def respFLI(self): + return self._respFLI + +class SHPushKVLResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.SH_PUSH_KVL_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class SHPopKVL(CapNProtoMsg): + _tc = MessageTypes.SH_POP_KVL + + def __init__(self, tag, key, value, respFLI): + super().__init__(tag) + self._key = key + self._value = value + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['key'] = self._key + rv['value'] = self._value + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.key = self._key + client_msg.value = self._value + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def key(self): + return self._key + + @property + def value(self): + return self._value + + @property + def respFLI(self): + return self._respFLI + + +class SHPopKVLResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.SH_POP_KVL_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class SHGetKVL(CapNProtoMsg): + _tc = MessageTypes.SH_GET_KVL + + def __init__(self, tag, key, respFLI): + super().__init__(tag) + self._key = key + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['key'] = self._key + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.key = self._key + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def key(self): + return self._key + + @property + def respFLI(self): + return self._respFLI + + +class SHGetKVLResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.SH_GET_KVL_RESPONSE + + def __init__(self, tag, ref, err, errInfo='', values=[]): + super().__init__(tag, ref, err, errInfo) + self._values = values + + def get_sdict(self): + rv = super().get_sdict() + rv['values'] = self._values + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + values = client_msg.init('values', len(self._values)) + for i in range(len(self._values)): + values[i] = self._values[i] + return cap_msg + + @property + def values(self): + return self._values + +class SHSetKV(CapNProtoMsg): + _tc = MessageTypes.SH_SET_KV + + def __init__(self, tag, key, value, respFLI): + super().__init__(tag) + self._key = key + self._value = value + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['key'] = self._key + rv['value'] = self._value + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg 
= cap_msg.init(self.capnp_name) + client_msg.key = self._key + client_msg.value = self._value + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def key(self): + return self._key + + @property + def value(self): + return self._value + + @property + def respFLI(self): + return self._respFLI + +class SHSetKVResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.SH_SET_KV_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + + +class SHGetKV(CapNProtoMsg): + _tc = MessageTypes.SH_GET_KV + + def __init__(self, tag, key, respFLI): + super().__init__(tag) + self._key = key + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['key'] = self._key + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.key = self._key + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def key(self): + return self._key + + @property + def respFLI(self): + return self._respFLI + + +class SHGetKVResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.SH_GET_KV_RESPONSE + + def __init__(self, tag, ref, err, errInfo='', value=None): + super().__init__(tag, ref, err, errInfo) + self._value = value + + def get_sdict(self): + rv = super().get_sdict() + rv['value'] = self._value + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.value = self._value + return cap_msg + + @property + def value(self): + return self._value + +class DDCreate(CapNProtoMsg): + + _tc = MessageTypes.DD_CREATE + + def __init__(self, tag, respFLI, args): + super().__init__(tag) + self._respFLI = respFLI + self._args = args + + def get_sdict(self): + rv = super().get_sdict() + rv['respFLI'] = self._respFLI + rv['args'] = self._args + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.respFLI = self._respFLI + client_msg.args = self._args + return cap_msg + + @property + def respFLI(self): + return self._respFLI + + @property + def args(self): + return self._args + +class DDCreateResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_CREATE_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDGetRandomManager(CapNProtoMsg): + + _tc = MessageTypes.DD_GET_RANDOM_MANAGER + + def __init__(self, tag, respFLI): + super().__init__(tag) + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def respFLI(self): + return self._respFLI + +class DDGetRandomManagerResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_GET_RANDOM_MANAGER_RESPONSE + + def __init__(self, tag, ref, err, manager, errInfo=''): + super().__init__(tag, ref, err, errInfo) + self._manager = manager + + def get_sdict(self): + rv = super().get_sdict() + rv['manager'] = self._manager + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.manager = self._manager + return cap_msg + + @property + def manager(self): + return self._manager + +class DDRegisterClient(CapNProtoMsg): + + _tc = MessageTypes.DD_REGISTER_CLIENT + + def __init__(self, tag, respFLI, bufferedRespFLI): + 
super().__init__(tag) + self._respFLI = respFLI + self._bufferedRespFLI = bufferedRespFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['respFLI'] = self._respFLI + rv['bufferedRespFLI'] = self._bufferedRespFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.respFLI = self._respFLI + client_msg.bufferedRespFLI = self._bufferedRespFLI + return cap_msg + + @property + def respFLI(self): + return self._respFLI + + @property + def bufferedRespFLI(self): + return self._bufferedRespFLI + +class DDRegisterClientResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_REGISTER_CLIENT_RESPONSE + + def __init__(self, tag, ref, err, clientID, numManagers, errInfo=''): + super().__init__(tag, ref, err, errInfo) + self._clientID = clientID + self._num_managers = numManagers + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + rv['numManagers'] = self._num_managers + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + client_msg.numManagers = self._num_managers + return cap_msg + + @property + def clientID(self): + return self._clientID + + @property + def numManagers(self): + return self._num_managers + +class DDConnectToManager(CapNProtoMsg): + + _tc = MessageTypes.DD_CONNECT_TO_MANAGER + + def __init__(self, tag, clientID, managerID): + super().__init__(tag) + self._clientID = clientID + self._managerID = managerID + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + rv['managerID'] = self._managerID + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + client_msg.managerID = self._managerID + return cap_msg + + @property + def clientID(self): + return self._clientID + + @property + def managerID(self): + return self._managerID + +class DDConnectToManagerResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_CONNECT_TO_MANAGER_RESPONSE + + def __init__(self, tag, ref, err, manager, errInfo=''): + super().__init__(tag, ref, err, errInfo) + self._manager = manager + + def get_sdict(self): + rv = super().get_sdict() + rv['manager'] = self._manager + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.manager = self._manager + return cap_msg + + @property + def manager(self): + return self._manager + +class DDDestroy(CapNProtoMsg): + + _tc = MessageTypes.DD_DESTROY + + def __init__(self, tag, clientID, respFLI): + super().__init__(tag) + self._clientID = clientID + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + client_msg.respFLI = self.respFLI + return cap_msg + + @property + def respFLI(self): + return self._respFLI + + @property + def clientID(self): + return self._clientID + +class DDDestroyResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_DESTROY_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDRegisterManager(CapNProtoMsg): + + _tc = MessageTypes.DD_REGISTER_MANAGER + + def __init__(self, tag, mainFLI, respFLI): + super().__init__(tag) + self._mainFLI = mainFLI + 
self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['mainFLI'] = self._mainFLI + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.mainFLI = self._mainFLI + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def mainFLI(self): + return self._mainFLI + + @property + def respFLI(self): + return self._respFLI + +class DDRegisterManagerResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_REGISTER_MANAGER_RESPONSE + + def __init__(self, tag, ref, err, managerID, errInfo='', managers=[]): + super().__init__(tag, ref, err, errInfo) + self._managers = managers + self._managerID = managerID + + def get_sdict(self): + rv = super().get_sdict() + rv['managers'] = self._managers + rv['managerID'] = self._managerID + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.managerID = self._managerID + msg_mgrs = client_msg.init('managers', len(self._managers)) + for i in range(len(self._managers)): + msg_mgrs[i] = self._managers[i] + return cap_msg + + @property + def managerID(self): + return self._managerID + + @property + def managers(self): + return self._managers + +class DDRegisterClientID(CapNProtoMsg): + + _tc = MessageTypes.DD_REGISTER_CLIENT_ID + + def __init__(self, tag, clientID, respFLI, bufferedRespFLI): + super().__init__(tag) + self._clientID = clientID + self._respFLI = respFLI + self._bufferedRespFLI = bufferedRespFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + rv['respFLI'] = self._respFLI + rv['bufferedRespFLI'] = self._bufferedRespFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + client_msg.respFLI = self._respFLI + client_msg.bufferedRespFLI = self._bufferedRespFLI + return cap_msg + + @property + def clientID(self): + return self._clientID + + @property + def respFLI(self): + return self._respFLI + + @property + def bufferedRespFLI(self): + return self._bufferedRespFLI + +class DDRegisterClientIDResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_REGISTER_CLIENT_ID_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDDestroyManager(CapNProtoMsg): + + _tc = MessageTypes.DD_DESTROY_MANAGER + + def __init__(self, tag, respFLI): + super().__init__(tag) + self._respFLI = respFLI + + def get_sdict(self): + rv = super().get_sdict() + rv['respFLI'] = self._respFLI + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.respFLI = self._respFLI + return cap_msg + + @property + def respFLI(self): + return self._respFLI + +class DDDestroyManagerResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_DESTROY_MANAGER_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDPut(CapNProtoMsg): + + _tc = MessageTypes.DD_PUT + + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg + + @property + def clientID(self): + return self._clientID + +class 
DDPutResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_PUT_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDGet(CapNProtoMsg): + + _tc = MessageTypes.DD_GET + + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg + + @property + def clientID(self): + return self._clientID + +class DDGetResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_GET_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDPop(CapNProtoMsg): + + _tc = MessageTypes.DD_POP + + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg + + @property + def clientID(self): + return self._clientID + +class DDPopResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_POP_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDContains(CapNProtoMsg): + + _tc = MessageTypes.DD_CONTAINS + + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg + + @property + def clientID(self): + return self._clientID + +class DDContainsResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_CONTAINS_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDGetLength(CapNProtoMsg): + + _tc = MessageTypes.DD_GET_LENGTH + + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID + + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + return rv + + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg + + @property + def clientID(self): + return self._clientID + +class DDGetLengthResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_GET_LENGTH_RESPONSE + + def __init__(self, tag, ref, err, errInfo='', length=0): + super().__init__(tag, ref, err, errInfo) + self._length = length + + def get_sdict(self): + rv = super().get_sdict() + rv['length'] = self._length + return rv -from ..globalservices.policy_eval import ResourceLayout, Policy + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.length = self._length + return cap_msg -from ..infrastructure import group_desc -# from ..infrastructure.policy import DefaultPolicy + @property + def length(self): + return self._length +class DDClear(CapNProtoMsg): -# This enum class lists the type codes in infrastructure -# messages. The values are significant for interoperability. 
+ _tc = MessageTypes.DD_CLEAR + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID -@enum.unique -class MessageTypes(enum.Enum): - """ - These are the enumerated values of message type identifiers within - the Dragon infrastructure messages. - """ - INVALID = 0 #: - GS_PROCESS_CREATE = 1 #: - GS_PROCESS_CREATE_RESPONSE = 2 #: - GS_PROCESS_LIST = 3 #: - GS_PROCESS_LIST_RESPONSE = 4 #: - GS_PROCESS_QUERY = 5 #: - GS_PROCESS_QUERY_RESPONSE = 6 #: - GS_PROCESS_KILL = 7 #: - GS_PROCESS_KILL_RESPONSE = 8 #: - GS_PROCESS_JOIN = 9 #: - GS_PROCESS_JOIN_RESPONSE = 10 #: - GS_CHANNEL_CREATE = 11 #: - GS_CHANNEL_CREATE_RESPONSE = 12 #: - GS_CHANNEL_LIST = 13 #: - GS_CHANNEL_LIST_RESPONSE = 14 #: - GS_CHANNEL_QUERY = 15 #: - GS_CHANNEL_QUERY_RESPONSE = 16 #: - GS_CHANNEL_DESTROY = 17 #: - GS_CHANNEL_DESTROY_RESPONSE = 18 #: - GS_CHANNEL_JOIN = 19 #: - GS_CHANNEL_JOIN_RESPONSE = 20 #: - GS_CHANNEL_DETACH = 21 #: - GS_CHANNEL_DETACH_RESPONSE = 22 #: - GS_CHANNEL_GET_SENDH = 23 #: - GS_CHANNEL_GET_SENDH_RESPONSE = 24 #: - GS_CHANNEL_GET_RECVH = 25 #: - GS_CHANNEL_GET_RECVH_RESPONSE = 26 #: - ABNORMAL_TERMINATION = 27 #: - GS_STARTED = 28 #: - GS_PING_SH = 29 #: - GS_IS_UP = 30 #: - GS_HEAD_EXIT = 31 #: - GS_CHANNEL_RELEASE = 32 #: - GS_HALTED = 33 #: - SH_PROCESS_CREATE = 34 #: - SH_PROCESS_CREATE_RESPONSE = 35 #: - SH_PROCESS_KILL = 36 #: - SH_PROCESS_EXIT = 37 #: - SH_CHANNEL_CREATE = 38 #: - SH_CHANNEL_CREATE_RESPONSE = 39 #: - SH_CHANNEL_DESTROY = 40 #: - SH_CHANNEL_DESTROY_RESPONSE = 41 #: - SH_LOCK_CHANNEL = 42 #: - SH_LOCK_CHANNEL_RESPONSE = 43 #: - SH_ALLOC_MSG = 44 #: - SH_ALLOC_MSG_RESPONSE = 45 #: - SH_ALLOC_BLOCK = 46 #: - SH_ALLOC_BLOCK_RESPONSE = 47 #: - SH_IS_UP = 48 #: - SH_CHANNELS_UP = 49 #: - SH_PING_GS = 50 #: - SH_HALTED = 51 #: - SH_FWD_INPUT = 52 #: - SH_FWD_INPUT_ERR = 53 #: - SH_FWD_OUTPUT = 54 #: - GS_TEARDOWN = 55 #: - SH_TEARDOWN = 56 #: - SH_PING_BE = 57 #: - BE_PING_SH = 58 #: - TA_PING_SH = 59 #: - SH_HALT_TA = 60 #: - TA_HALTED = 61 #: - SH_HALT_BE = 62 #: - BE_HALTED = 63 #: - TA_UP = 64 #: - GS_PING_PROC = 65 #: - GS_DUMP = 66 #: - SH_DUMP = 67 #: - LA_BROADCAST = 68 #: - LA_PASSTHRU_FB = 69 #: - LA_PASSTHRU_BF = 70 #: - GS_POOL_CREATE = 71 #: - GS_POOL_CREATE_RESPONSE = 72 #: - GS_POOL_DESTROY = 73 #: - GS_POOL_DESTROY_RESPONSE = 74 #: - GS_POOL_LIST = 75 #: - GS_POOL_LIST_RESPONSE = 76 #: - GS_POOL_QUERY = 77 #: - GS_POOL_QUERY_RESPONSE = 78 #: - SH_POOL_CREATE = 79 #: - SH_POOL_CREATE_RESPONSE = 80 #: - SH_POOL_DESTROY = 81 #: - SH_POOL_DESTROY_RESPONSE = 82 #: - SH_EXEC_MEM_REQUEST = 83 #: - SH_EXEC_MEM_RESPONSE = 84 #: - GS_UNEXPECTED = 85 #: - LA_SERVER_MODE = 86 #: - LA_SERVER_MODE_EXIT = 87 #: - LA_PROCESS_DICT = 88 #: - LA_PROCESS_DICT_RESPONSE = 89 #: - LA_DUMP = 90 #: - BE_NODEINDEX_SH = 91 #: - LA_CHANNELS_INFO = 92 #: - SH_PROCESS_KILL_RESPONSE = 93 #: - BREAKPOINT = 94 #: - GS_PROCESS_JOIN_LIST = 95 #: - GS_PROCESS_JOIN_LIST_RESPONSE = 96 #: - GS_NODE_QUERY = 97 #: - GS_NODE_QUERY_RESPONSE = 98 #: - LOGGING_MSG = 99 #: - LOGGING_MSG_LIST = 100 #: - LOG_FLUSHED = 101 #: - GS_NODE_LIST = 102 #: - GS_NODE_LIST_RESPONSE = 103 #: - GS_NODE_QUERY_TOTAL_CPU_COUNT = 104 #: - GS_NODE_QUERY_TOTAL_CPU_COUNT_RESPONSE = 105 #: - BE_IS_UP = 106 #: - FE_NODE_IDX_BE = 107 #: - HALT_OVERLAY = 108 #: - HALT_LOGGING_INFRA = 109 - OVERLAY_PING_BE = 110 #: - OVERLAY_PING_LA = 111 #: - LA_HALT_OVERLAY = 112 #: - BE_HALT_OVERLAY = 113 #: - OVERLAY_HALTED = 114 #: - EXCEPTIONLESS_ABORT = 115 #: Communicate abnormal termination without raising 
exception - LA_EXIT = 116 #: - GS_GROUP_LIST = 117 #: - GS_GROUP_LIST_RESPONSE = 118 #: - GS_GROUP_QUERY = 119 #: - GS_GROUP_QUERY_RESPONSE = 120 #: - GS_GROUP_DESTROY = 121 #: - GS_GROUP_DESTROY_RESPONSE = 122 #: - GS_GROUP_ADD_TO = 123 #: - GS_GROUP_ADD_TO_RESPONSE = 124 #: - GS_GROUP_REMOVE_FROM = 125 #: - GS_GROUP_REMOVE_FROM_RESPONSE = 126 #: - GS_GROUP_CREATE = 127 #: - GS_GROUP_CREATE_RESPONSE = 128 #: - GS_GROUP_KILL = 129 #: - GS_GROUP_KILL_RESPONSE = 130 #: - GS_GROUP_CREATE_ADD_TO = 131 #: - GS_GROUP_CREATE_ADD_TO_RESPONSE = 132 #: - GS_GROUP_DESTROY_REMOVE_FROM = 133 #: - GS_GROUP_DESTROY_REMOVE_FROM_RESPONSE = 134 #: - HSTA_UPDATE_NODES = 135 #: + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + return rv -@enum.unique -class FileDescriptor(enum.Enum): - stdin = 0 - stdout = 1 - stderr = 2 + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg + @property + def clientID(self): + return self._clientID -PIPE = subprocess.PIPE -STDOUT = subprocess.STDOUT -DEVNULL = subprocess.DEVNULL +class DDClearResponse(CapNProtoResponseMsg): + _tc = MessageTypes.DD_CLEAR_RESPONSE -class AbnormalTerminationError(Exception): + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) - def __init__(self, msg=''): - self._msg = msg +class DDGetIterator(CapNProtoMsg): - def __str__(self): - return f'{self._msg}' + _tc = MessageTypes.DD_GET_ITERATOR - def __repr__(self): - return f"{str(__class__)}({repr(self._msg)})" + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID -@dataclass -class PMIInfo(): - """ - Required information to enable the launching of pmi based applications. - """ + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + return rv - job_id: int - lrank: int - ppn: int - nid: int - nnodes: int - nranks: int - nidlist: list[int] - hostlist: list[str] - control_port: int - pid_base: int + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg - @classmethod - def fromdict(cls, d): - try: - return cls(**d) - except Exception as exc: - raise ValueError(f'Error deserializing {cls.__name__} {d=}') from exc + @property + def clientID(self): + return self._clientID +class DDGetIteratorResponse(CapNProtoResponseMsg): -class _MsgBase(object): - """Common base for all messages. + _tc = MessageTypes.DD_GET_ITERATOR_RESPONSE - This common base type for all messages sets up the - default fields and the serialization strategy for - now. 
- """ + def __init__(self, tag, ref, err, errInfo='', iterID=0): + super().__init__(tag, ref, err, errInfo) + self._iter_id = iterID - _tc = MessageTypes.INVALID # deliberately invalid value, overridden + def get_sdict(self): + rv = super().get_sdict() + rv['iterID'] = self._iter_id + return rv - @enum.unique - class Errors(enum.Enum): - INVALID = -1 # deliberately invalid, overridden + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.iterID = self._iter_id + return cap_msg - def __init__(self, tag, ref=None, err=None): - assert isinstance(tag, int) + @property + def iterID(self): + return self._iter_id - self._tag = int(tag) +class DDIteratorNext(CapNProtoMsg): - if ref is None: - self._ref = None - else: - self._ref = int(ref) + _tc = MessageTypes.DD_ITERATOR_NEXT - if err is not None: - if isinstance(err, self.Errors): - self._err = err - elif isinstance(err, int): - self._err = self.Errors(err) - else: - raise NotImplementedError('invalid error parameter') - else: - self._err = err + def __init__(self, tag, clientID, iterID): + super().__init__(tag) + self._clientID = clientID + self._iterID = iterID def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + rv['iterID'] = self._iterID + return rv - rv = {'_tc': self._tc.value, - 'tag': self.tag} + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + client_msg.iterID = self._iterID + return cap_msg - if self.err is not None: - rv['err'] = self.err.value + @property + def clientID(self): + return self._clientID - if self.ref is not None: - assert isinstance(self.ref, int) - rv['ref'] = self.ref + @property + def iterID(self): + return self._iterID + +class DDIteratorNextResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_ITERATOR_NEXT_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) + +class DDKeys(CapNProtoMsg): + + _tc = MessageTypes.DD_KEYS + + def __init__(self, tag, clientID): + super().__init__(tag) + self._clientID = clientID + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID return rv + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + return cap_msg + @property - def tc(self): - return self._tc + def clientID(self): + return self._clientID - @classmethod - def tcv(cls): - return cls._tc.value +class DDKeysResponse(CapNProtoResponseMsg): - @property - def tag(self): - return self._tag + _tc = MessageTypes.DD_KEYS_RESPONSE - @tag.setter - def tag(self, value): - self._tag = value + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) - @property - def ref(self): - return self._ref +class DDDeregisterClient(CapNProtoMsg): - @property - def err(self): - return self._err + _tc = MessageTypes.DD_DEREGISTER_CLIENT - # the keys in the serialization dictionary must match up - # with the arguments in the __init__ constructor - # for all the subclasses - @classmethod - def from_sdict(cls, sdict): - return cls(**sdict) + def __init__(self, tag, clientID, respFLI): + super().__init__(tag) + self._clientID = clientID + self._respFLI = respFLI - def uncompressed_serialize(self): - return json.dumps(self.get_sdict()) + def get_sdict(self): + rv = super().get_sdict() + rv['clientID'] = self._clientID + rv['respFLI'] = self._respFLI + return rv - def serialize(self): - 
return base64.b64encode(zlib.compress(json.dumps(self.get_sdict()).encode('utf-8'))).decode('ascii') + def builder(self): + cap_msg = super().builder() + client_msg = cap_msg.init(self.capnp_name) + client_msg.clientID = self._clientID + client_msg.respFLI = self._respFLI + return cap_msg - def __str__(self): - cn = self.__class__.__name__ - msg = f'{cn}: {self.tag}' - if hasattr(self, 'p_uid'): - msg += f' {self.p_uid}' + @property + def clientID(self): + return self._clientID - if hasattr(self, 'r_c_uid'): - msg += f'->{self.r_c_uid}' - return msg + @property + def respFLI(self): + return self._respFLI - def __repr__(self): - fields_to_set = self.get_sdict() - del fields_to_set['_tc'] - fs = ', '.join([f'{k!s}={v!r}' for k, v in fields_to_set.items()]) - return f'{self.__class__.__name__}({fs})' + @respFLI.setter + def respFLI(self, respFLI): + self._respFLI = respFLI +class DDDeregisterClientResponse(CapNProtoResponseMsg): + + _tc = MessageTypes.DD_DEREGISTER_CLIENT_RESPONSE + + def __init__(self, tag, ref, err, errInfo=''): + super().__init__(tag, ref, err, errInfo) # class setup methodology: # 1) the _tc class variable has the value of the typecode @@ -339,7 +1545,7 @@ def __repr__(self): # in the serialization the typecode is gotten from the # class attribute _tc. -class GSProcessCreate(_MsgBase): +class GSProcessCreate(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of @@ -385,8 +1591,8 @@ def __init__(self, tag, p_uid, r_c_uid, exe, args, env=None, rundir='', if _pmi_info is None: self._pmi_info = None elif isinstance(_pmi_info, dict): - self._pmi_info = PMIInfo.fromdict(_pmi_info) - elif isinstance(_pmi_info, PMIInfo): + self._pmi_info = PMIProcessInfo.fromdict(_pmi_info) + elif isinstance(_pmi_info, PMIProcessInfo): self._pmi_info = _pmi_info else: raise ValueError(f'GS unsupported _pmi_info value {_pmi_info=}') @@ -450,7 +1656,7 @@ def __str__(self): return super().__str__() + f'{self.exe} {self.args}' -class GSProcessCreateResponse(_MsgBase): +class GSProcessCreateResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a @@ -497,7 +1703,7 @@ def get_sdict(self): return rv -class GSProcessList(_MsgBase): +class GSProcessList(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -517,7 +1723,7 @@ def get_sdict(self): return rv -class GSProcessListResponse(_MsgBase): +class GSProcessListResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -544,7 +1750,7 @@ def get_sdict(self): return rv -class GSProcessQuery(_MsgBase): +class GSProcessQuery(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -578,7 +1784,7 @@ def get_sdict(self): return rv -class GSProcessQueryResponse(_MsgBase): +class GSProcessQueryResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -627,7 +1833,7 @@ def get_sdict(self): return rv -class GSProcessKill(_MsgBase): +class GSProcessKill(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -658,7 +1864,7 @@ def get_sdict(self): return rv -class GSProcessKillResponse(_MsgBase): +class GSProcessKillResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
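The _MsgBase to InfraMsg rename above does not alter the encoding pipeline: serialize() still JSON-encodes get_sdict(), zlib-compresses it, and base64-encodes the result, with the direct base64 module call swapped for the b64encode helper imported from dragon.utils. A minimal, stdlib-only sketch of that round trip, using a made-up sdict in place of a real message:

import base64
import json
import zlib

# Hypothetical get_sdict() output; a real message also carries its '_tc' typecode.
sdict = {'_tc': 5, 'tag': 42, 'p_uid': 4096, 'user_name': 'worker0'}

# Same pipeline as InfraMsg.serialize(): JSON -> zlib -> base64.
wire = base64.b64encode(zlib.compress(json.dumps(sdict).encode('utf-8')))

# The receiving side reverses the pipeline and dispatches on '_tc'.
restored = json.loads(zlib.decompress(base64.b64decode(wire)).decode('utf-8'))
assert restored == sdict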
@@ -691,7 +1897,7 @@ def get_sdict(self): return rv -class GSProcessJoin(_MsgBase): +class GSProcessJoin(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -727,7 +1933,7 @@ def __str__(self): return first + f' {self.t_p_uid}:{self.user_name}' -class GSProcessJoinResponse(_MsgBase): +class GSProcessJoinResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -773,7 +1979,7 @@ def __str__(self): return msg -class GSProcessJoinList(_MsgBase): +class GSProcessJoinList(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -782,7 +1988,11 @@ class GSProcessJoinList(_MsgBase): _tc = MessageTypes.GS_PROCESS_JOIN_LIST - def __init__(self, tag, p_uid, r_c_uid, timeout=-1, t_p_uid_list=None, user_name_list=None, join_all=False, _tc=None): + def __init__(self, tag, p_uid, r_c_uid, timeout=-1, + t_p_uid_list=None, user_name_list=None, + join_all=False, return_on_bad_exit=False, + _tc=None): + super().__init__(tag) self.p_uid = int(p_uid) self.r_c_uid = int(r_c_uid) @@ -794,6 +2004,7 @@ def __init__(self, tag, p_uid, r_c_uid, timeout=-1, t_p_uid_list=None, user_name self.user_name_list = user_name_list self.timeout = timeout self.join_all = join_all + self.return_on_bad_exit = return_on_bad_exit def get_sdict(self): rv = super().get_sdict() @@ -801,6 +2012,7 @@ def get_sdict(self): rv['r_c_uid'] = self.r_c_uid rv['timeout'] = self.timeout rv['join_all'] = self.join_all + rv['return_on_bad_exit'] = self.return_on_bad_exit if self.t_p_uid_list: rv['t_p_uid_list'] = self.t_p_uid_list if self.user_name_list: @@ -808,12 +2020,13 @@ def get_sdict(self): return rv - def __str__(self): - first = super().__str__() - return first + f' {self.t_p_uid_list}:{self.user_name_list}' + # TODO AICI-1422 Implement verbose logging options + # def __str__(self): + # first = super().__str__() + # return first + f' {self.t_p_uid_list}:{self.user_name_list}' -class GSProcessJoinListResponse(_MsgBase): +class GSProcessJoinListResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -847,7 +2060,7 @@ def __str__(self): return msg -class GSPoolCreate(_MsgBase): +class GSPoolCreate(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -891,7 +2104,7 @@ def get_sdict(self): return rv -class GSPoolCreateResponse(_MsgBase): +class GSPoolCreateResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -940,7 +2153,7 @@ def get_sdict(self): return rv -class GSPoolList(_MsgBase): +class GSPoolList(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -962,7 +2175,7 @@ def get_sdict(self): return rv -class GSPoolListResponse(_MsgBase): +class GSPoolListResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -989,7 +2202,7 @@ def get_sdict(self): return rv -class GSPoolQuery(_MsgBase): +class GSPoolQuery(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1024,7 +2237,7 @@ def get_sdict(self): return rv -class GSPoolQueryResponse(_MsgBase): +class GSPoolQueryResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
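GSProcessJoinList now carries a return_on_bad_exit flag, and global services consumes it in the continue_to_wait helper added earlier in this patch: a queued join-list request is answered as soon as any watched process exits with a non-zero code, in addition to the existing 'any process exited' and 'all processes exited' conditions. A self-contained, slightly restructured but equivalent sketch of that predicate:

def continue_to_wait(join_all, success_num, pending_puids, return_on_bad_exit, nonzero_exit):
    """True while the join-list request should stay queued; False when a reply is due."""
    # A bad exit cuts the wait short when the caller asked for that behavior.
    if return_on_bad_exit and nonzero_exit:
        return False
    if join_all:
        # 'all' semantics: keep waiting while any target is still pending.
        return bool(pending_puids)
    # 'any' semantics: keep waiting only if nothing has completed yet.
    return not success_num and bool(pending_puids)

# One target already exited cleanly, two still pending:
assert continue_to_wait(False, 1, [4097, 4098], False, False) is False  # 'any' -> reply now
assert continue_to_wait(True, 1, [4097, 4098], False, False) is True    # 'all' -> keep waiting
assert continue_to_wait(True, 1, [4097, 4098], True, True) is False     # bad exit -> reply now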
@@ -1073,7 +2286,7 @@ def get_sdict(self): return rv -class GSPoolDestroy(_MsgBase): +class GSPoolDestroy(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1102,7 +2315,7 @@ def get_sdict(self): return rv -class GSPoolDestroyResponse(_MsgBase): +class GSPoolDestroyResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1135,7 +2348,7 @@ def get_sdict(self): return rv -class GSGroupCreate(_MsgBase): +class GSGroupCreate(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1161,6 +2374,16 @@ def __init__(self, tag, p_uid, r_c_uid, items=None, policy=None, user_name='', _ self.policy = Policy(**policy) elif isinstance(policy, Policy): self.policy = policy + elif isinstance(policy, list): + temp_policies = [] + for p in policy: + if isinstance(p, Policy): + temp_policies.append(p) + elif isinstance(p, dict): + temp_policies.append(Policy(**p)) + else: + raise ValueError(f'GS Groups unsupported policy value {p=}') + self.policy = temp_policies else: raise ValueError(f'GS Groups unsupported policy value {policy=}') @@ -1170,13 +2393,16 @@ def get_sdict(self): rv['p_uid'] = self.p_uid rv['r_c_uid'] = self.r_c_uid rv['items'] = self.items - rv['policy'] = self.policy.get_sdict() + if isinstance(self.policy, list): + rv['policy'] = [policy.get_sdict() if isinstance(policy, Policy) else policy for policy in self.policy] + else: + rv['policy'] = self.policy.get_sdict() rv['user_name'] = self.user_name return rv -class GSGroupCreateResponse(_MsgBase): +class GSGroupCreateResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1209,7 +2435,7 @@ def get_sdict(self): rv['desc'] = None if self.desc is None else self.desc.get_sdict() return rv -class GSGroupList(_MsgBase): +class GSGroupList(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1230,7 +2456,7 @@ def get_sdict(self): return rv -class GSGroupListResponse(_MsgBase): +class GSGroupListResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1257,7 +2483,7 @@ def get_sdict(self): return rv -class GSGroupQuery(_MsgBase): +class GSGroupQuery(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1292,7 +2518,7 @@ def get_sdict(self): return rv -class GSGroupQueryResponse(_MsgBase): +class GSGroupQueryResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1341,7 +2567,7 @@ def get_sdict(self): return rv -class GSGroupKill(_MsgBase): +class GSGroupKill(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1372,7 +2598,7 @@ def get_sdict(self): return rv -class GSGroupKillResponse(_MsgBase): +class GSGroupKillResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1420,7 +2646,7 @@ def get_sdict(self): return rv -class GSGroupDestroy(_MsgBase): +class GSGroupDestroy(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
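GSGroupCreate now accepts a single Policy, a dict, or a list mixing both, so a group can carry one policy overall or one per member. A sketch of that normalization, assuming a stand-in dataclass rather than the real dragon.infrastructure.policy.Policy:

from dataclasses import dataclass, asdict
from typing import List, Union

@dataclass
class Policy:                          # stand-in for the real Policy class
    placement: str = 'default'
    def get_sdict(self):
        return asdict(self)

def normalize_policy(policy: Union[Policy, dict, List[Union[Policy, dict]]]):
    # One policy for the whole group, or a list with one entry per member.
    if isinstance(policy, dict):
        return Policy(**policy)
    if isinstance(policy, Policy):
        return policy
    if isinstance(policy, list):
        return [p if isinstance(p, Policy) else Policy(**p) for p in policy]
    raise ValueError(f'unsupported policy value {policy=}')

per_member = normalize_policy([{'placement': 'node0'}, Policy('node1')])
assert [p.get_sdict()['placement'] for p in per_member] == ['node0', 'node1']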
@@ -1449,7 +2675,7 @@ def get_sdict(self): return rv -class GSGroupDestroyResponse(_MsgBase): +class GSGroupDestroyResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1496,7 +2722,7 @@ def get_sdict(self): return rv -class GSGroupAddTo(_MsgBase): +class GSGroupAddTo(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1532,7 +2758,7 @@ def get_sdict(self): return rv -class GSGroupAddToResponse(_MsgBase): +class GSGroupAddToResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1581,7 +2807,7 @@ def get_sdict(self): return rv -class GSGroupCreateAddTo(_MsgBase): +class GSGroupCreateAddTo(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1627,7 +2853,7 @@ def get_sdict(self): return rv -class GSGroupCreateAddToResponse(_MsgBase): +class GSGroupCreateAddToResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1676,7 +2902,7 @@ def get_sdict(self): return rv -class GSGroupRemoveFrom(_MsgBase): +class GSGroupRemoveFrom(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1712,7 +2938,7 @@ def get_sdict(self): return rv -class GSGroupRemoveFromResponse(_MsgBase): +class GSGroupRemoveFromResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1761,7 +2987,7 @@ def get_sdict(self): return rv -class GSGroupDestroyRemoveFrom(_MsgBase): +class GSGroupDestroyRemoveFrom(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1797,7 +3023,7 @@ def get_sdict(self): return rv -class GSGroupDestroyRemoveFromResponse(_MsgBase): +class GSGroupDestroyRemoveFromResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1846,7 +3072,7 @@ def get_sdict(self): return rv -class GSChannelCreate(_MsgBase): +class GSChannelCreate(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1888,7 +3114,7 @@ def options(self, value): self._options = channel_desc.ChannelOptions.from_sdict(value) -class GSChannelCreateResponse(_MsgBase): +class GSChannelCreateResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1933,7 +3159,7 @@ def get_sdict(self): return rv -class GSChannelList(_MsgBase): +class GSChannelList(InfraMsg): """ Refer to :ref:`definition` and to :ref:`Common Fields` for a description of the message structure. @@ -1953,7 +3179,7 @@ def get_sdict(self): return rv -class GSChannelListResponse(_MsgBase): +class GSChannelListResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -1979,7 +3205,7 @@ def get_sdict(self): return rv -class GSChannelQuery(_MsgBase): +class GSChannelQuery(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2016,7 +3242,7 @@ def get_sdict(self): return rv -class GSChannelQueryResponse(_MsgBase): +class GSChannelQueryResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
@@ -2067,7 +3293,7 @@ def get_sdict(self): return rv -class GSChannelDestroy(_MsgBase): +class GSChannelDestroy(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2101,7 +3327,7 @@ def get_sdict(self): return rv -class GSChannelDestroyResponse(_MsgBase): +class GSChannelDestroyResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2130,7 +3356,7 @@ def get_sdict(self): return rv -class GSChannelJoin(_MsgBase): +class GSChannelJoin(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2155,7 +3381,7 @@ def get_sdict(self): return rv -class GSChannelJoinResponse(_MsgBase): +class GSChannelJoinResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2198,7 +3424,7 @@ def get_sdict(self): return rv -class GSChannelDetach(_MsgBase): +class GSChannelDetach(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2221,7 +3447,7 @@ def get_sdict(self): return rv -class GSChannelDetachResponse(_MsgBase): +class GSChannelDetachResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2245,7 +3471,7 @@ def get_sdict(self): return rv -class GSChannelGetSendH(_MsgBase): +class GSChannelGetSendH(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2270,7 +3496,7 @@ def get_sdict(self): return rv -class GSChannelGetSendHResponse(_MsgBase): +class GSChannelGetSendHResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2321,7 +3547,7 @@ def get_sdict(self): return rv -class GSChannelGetRecvH(_MsgBase): +class GSChannelGetRecvH(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2346,7 +3572,7 @@ def get_sdict(self): return rv -class GSChannelGetRecvHResponse(_MsgBase): +class GSChannelGetRecvHResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2396,7 +3622,7 @@ def get_sdict(self): return rv -class GSNodeList(_MsgBase): +class GSNodeList(InfraMsg): """ *type enum* GS_NODE_LIST (= 102) @@ -2432,7 +3658,7 @@ def get_sdict(self): return rv -class GSNodeListResponse(_MsgBase): +class GSNodeListResponse(InfraMsg): """ *type enum* GS_NODE_LIST_RESPONSE (= 103) @@ -2472,7 +3698,7 @@ def get_sdict(self): return rv -class GSNodeQuery(_MsgBase): +class GSNodeQuery(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2499,7 +3725,7 @@ def get_sdict(self): return rv -class GSNodeQueryResponse(_MsgBase): +class GSNodeQueryResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
@@ -2550,7 +3776,7 @@ def get_sdict(self): return rv -class GSNodeQueryTotalCPUCount(_MsgBase): +class GSNodeQueryTotalCPUCount(InfraMsg): """ *type enum* GS_NODE_QUERY_TOTAL_CPU_COUNT (= 104) @@ -2580,7 +3806,7 @@ def get_sdict(self): return rv -class GSNodeQueryTotalCPUCountResponse(_MsgBase): +class GSNodeQueryTotalCPUCountResponse(InfraMsg): """ *type enum* GS_NODE_QUERY_TOTAL_CPU_COUNT_RESPONSE (= 105) @@ -2647,7 +3873,7 @@ def get_sdict(self): return rv -class AbnormalTermination(_MsgBase): +class AbnormalTermination(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2656,13 +3882,15 @@ class AbnormalTermination(_MsgBase): _tc = MessageTypes.ABNORMAL_TERMINATION - def __init__(self, tag, err_info='', _tc=None): + def __init__(self, tag, err_info='', host_id=0, _tc=None): super().__init__(tag) self.err_info = err_info + self.host_id = host_id def get_sdict(self): rv = super().get_sdict() rv['err_info'] = self.err_info + rv['host_id'] = self.host_id return rv def __str__(self): @@ -2673,7 +3901,7 @@ def __str__(self): return str(super()) -class ExceptionlessAbort(_MsgBase): +class ExceptionlessAbort(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2690,7 +3918,7 @@ def get_sdict(self): return rv -class GSStarted(_MsgBase): +class GSStarted(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2709,7 +3937,7 @@ def get_sdict(self): return rv -class GSPingSH(_MsgBase): +class GSPingSH(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2726,7 +3954,7 @@ def get_sdict(self): return rv -class GSIsUp(_MsgBase): +class GSIsUp(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2743,7 +3971,7 @@ def get_sdict(self): return rv -class GSPingProc(_MsgBase): +class GSPingProc(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2768,14 +3996,14 @@ def get_sdict(self): return rv -class GSDump(_MsgBase): +class GSDumpState(InfraMsg): """ - Refer to :ref:`definition` and :ref:`Common Fields` for a description of the + Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. """ - _tc = MessageTypes.GS_DUMP + _tc = MessageTypes.GS_DUMP_STATE def __init__(self, tag, filename, _tc=None): super().__init__(tag) @@ -2787,7 +4015,7 @@ def get_sdict(self): return rv -class GSHeadExit(_MsgBase): +class GSHeadExit(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2806,7 +4034,7 @@ def get_sdict(self): return rv -class GSTeardown(_MsgBase): +class GSTeardown(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2823,7 +4051,7 @@ def get_sdict(self): return rv -class GSUnexpected(_MsgBase): +class GSUnexpected(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2840,7 +4068,7 @@ def get_sdict(self): return rv -class GSChannelRelease(_MsgBase): +class GSChannelRelease(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
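AbnormalTermination gains a host_id field (default 0) so the receiver can tell which node raised the abort. A small sketch of the extended payload, with the typecode shown symbolically rather than as the real enum value:

def abnormal_termination_sdict(tag, err_info='', host_id=0):
    # host_id identifies the backend node that reported the failure; 0 means unknown.
    return {'_tc': 'ABNORMAL_TERMINATION', 'tag': tag,
            'err_info': err_info, 'host_id': host_id}

sd = abnormal_termination_sdict(tag=11, err_info='backend died', host_id=12345)
assert sd['host_id'] == 12345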
@@ -2857,7 +4085,7 @@ def get_sdict(self): return rv -class GSHalted(_MsgBase): +class GSHalted(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2874,7 +4102,7 @@ def get_sdict(self): return rv -class SHProcessCreate(_MsgBase): +class SHProcessCreate(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -2889,9 +4117,10 @@ class SHProcessCreate(_MsgBase): _tc = MessageTypes.SH_PROCESS_CREATE - def __init__(self, tag, p_uid, r_c_uid, t_p_uid, exe, args, env=None, rundir='', options=None, initial_stdin='', - stdin=None, stdout=None, stderr=None, group=None, user=None, umask=- 1, pipesize=None, - stdin_msg=None, stdout_msg=None, stderr_msg=None, pmi_info=None, layout=None, _tc=None): + def __init__(self, tag, p_uid, r_c_uid, t_p_uid, exe, args, env=None, rundir='', options=None, + initial_stdin='', stdin=None, stdout=None, stderr=None, group=None, user=None, + umask=- 1, pipesize=None, stdin_msg=None, stdout_msg=None, stderr_msg=None, + pmi_info=None, layout=None, gs_ret_chan_msg=None, _tc=None): super().__init__(tag) if options is None: @@ -2934,8 +4163,8 @@ def __init__(self, tag, p_uid, r_c_uid, t_p_uid, exe, args, env=None, rundir='', if pmi_info is None: self.pmi_info = None elif isinstance(pmi_info, dict): - self.pmi_info = PMIInfo.fromdict(pmi_info) - elif isinstance(pmi_info, PMIInfo): + self.pmi_info = PMIProcessInfo.fromdict(pmi_info) + elif isinstance(pmi_info, PMIProcessInfo): self.pmi_info = pmi_info else: raise ValueError(f'LS unsupported pmi_info value {pmi_info=}') @@ -2949,6 +4178,11 @@ def __init__(self, tag, p_uid, r_c_uid, t_p_uid, exe, args, env=None, rundir='', else: raise ValueError(f'LS unsupported layout value {layout=}') + if gs_ret_chan_msg is None or isinstance(gs_ret_chan_msg, SHChannelCreate): + self.gs_ret_chan_msg = gs_ret_chan_msg + else: + self.gs_ret_chan_msg = SHChannelCreate.from_sdict(gs_ret_chan_msg) + @property def options(self): return self._options @@ -2985,10 +4219,11 @@ def get_sdict(self): rv['stderr_msg'] = (None if self.stderr_msg is None else self.stderr_msg.get_sdict()) rv['pmi_info'] = None if self.pmi_info is None else asdict(self.pmi_info) rv['layout'] = None if self.layout is None else asdict(self.layout) + rv['gs_ret_chan_msg'] = (None if self.gs_ret_chan_msg is None else self.gs_ret_chan_msg.get_sdict()) return rv -class SHProcessCreateResponse(_MsgBase): +class SHProcessCreateResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
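SHProcessCreate now nests an optional SHChannelCreate for the GS return channel, accepting either an already-built message or its sdict. A sketch of that coerce-or-pass-through idiom, using a hypothetical nested message type in place of SHChannelCreate:

class NestedMsg:                        # stand-in for SHChannelCreate
    def __init__(self, m_uid, c_uid):
        self.m_uid, self.c_uid = m_uid, c_uid
    def get_sdict(self):
        return {'m_uid': self.m_uid, 'c_uid': self.c_uid}
    @classmethod
    def from_sdict(cls, sdict):
        return cls(**sdict)

def coerce_nested(value):
    # None and constructed messages pass through; plain dicts are rehydrated.
    if value is None or isinstance(value, NestedMsg):
        return value
    return NestedMsg.from_sdict(value)

assert coerce_nested(None) is None
assert coerce_nested({'m_uid': 4, 'c_uid': 900}).c_uid == 900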
@@ -3002,7 +4237,8 @@ class Errors(enum.Enum): SUCCESS = 0 #: FAIL = 1 #: - def __init__(self, tag, ref, err, err_info='', stdin_resp=None, stdout_resp=None, stderr_resp=None, _tc=None): + def __init__(self, tag, ref, err, err_info='', stdin_resp=None, stdout_resp=None, + stderr_resp=None, gs_ret_chan_resp = None, _tc=None): super().__init__(tag, ref, err) self.err_info = err_info @@ -3022,6 +4258,11 @@ def __init__(self, tag, ref, err, err_info='', stdin_resp=None, stdout_resp=None else: self.stderr_resp = SHChannelCreateResponse.from_sdict(stderr_resp) + if gs_ret_chan_resp is None or isinstance(gs_ret_chan_resp, SHChannelCreateResponse): + self.gs_ret_chan_resp = gs_ret_chan_resp + else: + self.gs_ret_chan_resp = SHChannelCreateResponse.from_sdict(gs_ret_chan_resp) + def get_sdict(self): rv = super().get_sdict() if self.Errors.FAIL == self.err: @@ -3029,10 +4270,11 @@ def get_sdict(self): rv['stdin_resp'] = (None if self.stdin_resp is None else self.stdin_resp.get_sdict()) rv['stdout_resp'] = (None if self.stdout_resp is None else self.stdout_resp.get_sdict()) rv['stderr_resp'] = (None if self.stderr_resp is None else self.stderr_resp.get_sdict()) + rv['gs_ret_chan_resp'] = (None if self.gs_ret_chan_resp is None else self.gs_ret_chan_resp.get_sdict()) return rv -class SHProcessKill(_MsgBase): +class SHProcessKill(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3057,7 +4299,7 @@ def get_sdict(self): return rv -class SHProcessKillResponse(_MsgBase): +class SHProcessKillResponse(InfraMsg): _tc = MessageTypes.SH_PROCESS_KILL_RESPONSE @enum.unique @@ -3077,7 +4319,7 @@ def get_sdict(self): return rv -class SHProcessExit(_MsgBase): +class SHProcessExit(InfraMsg): """ Refer to :ref:`definition` and to :ref:`Common Fields` for a description of @@ -3103,7 +4345,91 @@ def get_sdict(self): return rv -class SHPoolCreate(_MsgBase): +class SHMultiProcessCreate(InfraMsg): + + _tc = MessageTypes.SH_MULTI_PROCESS_CREATE + + def __init__(self, tag, r_c_uid, procs : List[Union[Dict, SHProcessCreate]], + pmi_group_info : Optional[PMIGroupInfo] = None, _tc=None): + super().__init__(tag) + self.r_c_uid = int(r_c_uid) + + if pmi_group_info is None: + self.pmi_group_info = None + elif isinstance(pmi_group_info, dict): + self.pmi_group_info = PMIGroupInfo.fromdict(pmi_group_info) + elif isinstance(pmi_group_info, PMIGroupInfo): + self.pmi_group_info = pmi_group_info + else: + raise ValueError(f'GS unsupported pmi_group_info value {pmi_group_info=}') + + self.procs = [] + for proc in procs: + if isinstance(proc, SHProcessCreate): + self.procs.append(proc) + elif isinstance(proc, dict): + self.procs.append(SHProcessCreate.from_sdict(proc)) + else: + raise ValueError(f'proc is not a supported type %s', type(proc)) + + def get_sdict(self): + rv = super().get_sdict() + rv['r_c_uid'] = self.r_c_uid + rv['pmi_group_info'] = None if self.pmi_group_info is None else asdict(self.pmi_group_info) + rv["procs"] = [proc.get_sdict() for proc in self.procs] + return rv + + +class SHMultiProcessCreateResponse(InfraMsg): + + _tc = MessageTypes.SH_MULTI_PROCESS_CREATE_RESPONSE + + @enum.unique + class Errors(enum.Enum): + SUCCESS = 0 + FAIL = 1 + + def __init__( + self, + tag, + ref, + err, + err_info="", + exit_code=0, + responses: List[Union[Dict, SHProcessCreateResponse]] = None, + failed: bool = False, + _tc=None, + ): + super().__init__(tag, ref, err) + self.err_info = err_info + self.exit_code = exit_code + + self.failed = failed + self.responses = [] + for 
response in responses: + if isinstance(response, SHProcessCreateResponse): + self.responses.append(response) + elif isinstance(response, dict): + self.responses.append(SHProcessCreateResponse.from_sdict(response)) + else: + raise ValueError(f'response is not a supported type %s', type(response)) + + def get_sdict(self): + rv = super().get_sdict() + rv['exit_code'] = self.exit_code + + if self.err == self.Errors.SUCCESS: + rv['failed'] = self.failed + rv["responses"] = [response.get_sdict() for response in self.responses] + elif self.err == self.Errors.FAIL: + rv['err_info'] = self.err_info + else: + raise NotImplementedError('close case') + + return rv + + +class SHPoolCreate(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3133,7 +4459,7 @@ def get_sdict(self): return rv -class SHPoolCreateResponse(_MsgBase): +class SHPoolCreateResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3169,7 +4495,7 @@ def get_sdict(self): return rv -class SHPoolDestroy(_MsgBase): +class SHPoolDestroy(InfraMsg): """ Refer to :ref:`definition` and to the :ref:`Common Fields` for a description of @@ -3193,7 +4519,7 @@ def get_sdict(self): return rv -class SHPoolDestroyResponse(_MsgBase): +class SHPoolDestroyResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3219,7 +4545,7 @@ def get_sdict(self): return rv -class SHExecMemRequest(_MsgBase): +class SHExecMemRequest(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3250,7 +4576,7 @@ def get_sdict(self): return rv -class SHExecMemResponse(_MsgBase): +class SHExecMemResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3282,7 +4608,7 @@ def get_sdict(self): return rv -class SHChannelCreate(_MsgBase): +class SHChannelCreate(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3324,7 +4650,7 @@ def options(self, value): self._options = dso.ChannelOptions.from_sdict(value) -class SHChannelCreateResponse(_MsgBase): +class SHChannelCreateResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3354,7 +4680,7 @@ def get_sdict(self): return rv -class SHChannelDestroy(_MsgBase): +class SHChannelDestroy(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3378,7 +4704,7 @@ def get_sdict(self): return rv -class SHChannelDestroyResponse(_MsgBase): +class SHChannelDestroyResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3404,7 +4730,7 @@ def get_sdict(self): return rv -class SHLockChannel(_MsgBase): +class SHLockChannel(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3425,7 +4751,7 @@ def get_sdict(self): return rv -class SHLockChannelResponse(_MsgBase): +class SHLockChannelResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3448,7 +4774,7 @@ def get_sdict(self): return rv -class SHAllocMsg(_MsgBase): +class SHAllocMsg(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
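SHMultiProcessCreate batches many SHProcessCreate payloads into one message, and its response carries one SHProcessCreateResponse per process. A sketch of the batching shape, with a hypothetical flat stand-in for the per-process message:

from typing import Dict, List, Union

class ProcCreate:                       # stand-in for SHProcessCreate
    def __init__(self, t_p_uid, exe):
        self.t_p_uid, self.exe = t_p_uid, exe
    def get_sdict(self):
        return {'t_p_uid': self.t_p_uid, 'exe': self.exe}
    @classmethod
    def from_sdict(cls, sdict):
        return cls(**sdict)

def batch_procs(procs: List[Union[Dict, ProcCreate]]) -> List[ProcCreate]:
    out = []
    for proc in procs:
        if isinstance(proc, ProcCreate):
            out.append(proc)
        elif isinstance(proc, dict):
            out.append(ProcCreate.from_sdict(proc))
        else:
            raise ValueError(f'proc is not a supported type {type(proc)}')
    return out

batch = batch_procs([ProcCreate(1, '/bin/true'), {'t_p_uid': 2, 'exe': '/bin/false'}])
assert [p.get_sdict()['t_p_uid'] for p in batch] == [1, 2]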
@@ -3469,7 +4795,7 @@ def get_sdict(self): return rv -class SHAllocMsgResponse(_MsgBase): +class SHAllocMsgResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3490,7 +4816,7 @@ def get_sdict(self): return rv -class SHAllocBlock(_MsgBase): +class SHAllocBlock(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3511,7 +4837,7 @@ def get_sdict(self): return rv -class SHAllocBlockResponse(_MsgBase): +class SHAllocBlockResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3546,7 +4872,7 @@ def get_sdict(self): return rv -class SHChannelsUp(_MsgBase): +class SHChannelsUp(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3576,7 +4902,7 @@ def get_sdict(self): return rv -class SHPingGS(_MsgBase): +class SHPingGS(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3597,7 +4923,7 @@ def get_sdict(self): return rv -class SHTeardown(_MsgBase): +class SHTeardown(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3614,7 +4940,7 @@ def get_sdict(self): return rv -class SHPingBE(_MsgBase): +class SHPingBE(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3622,7 +4948,7 @@ class SHPingBE(_MsgBase): """ _tc = MessageTypes.SH_PING_BE - EMPTY = B64.bytes_to_str(b'') + EMPTY = b64encode(b'') def __init__(self, tag, shep_cd=EMPTY, be_cd=EMPTY, gs_cd=EMPTY, default_pd=EMPTY, inf_pd=EMPTY, _tc=None): @@ -3643,7 +4969,7 @@ def get_sdict(self): return rv -class SHHaltTA(_MsgBase): +class SHHaltTA(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3660,7 +4986,7 @@ def get_sdict(self): return rv -class SHHaltBE(_MsgBase): +class SHHaltBE(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3677,7 +5003,7 @@ def get_sdict(self): return rv -class SHHalted(_MsgBase): +class SHHalted(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3696,7 +5022,7 @@ def get_sdict(self): return rv -class SHFwdInput(_MsgBase): +class SHFwdInput(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3728,7 +5054,7 @@ def get_sdict(self): return rv -class SHFwdInputErr(_MsgBase): +class SHFwdInputErr(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3756,7 +5082,7 @@ def get_sdict(self): return rv -class SHFwdOutput(_MsgBase): +class SHFwdOutput(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3800,14 +5126,14 @@ def __str__(self): return f'{super().__str__()}, self.data={self.data!r}, self.p_uid={self.p_uid!r}, self.pid={self.pid!r}, self.fd_num={self.fd_num!r}' -class SHDumpState(_MsgBase): +class SHDumpState(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
""" - _tc = MessageTypes.SH_DUMP + _tc = MessageTypes.SH_DUMP_STATE def __init__(self, tag, filename=None, _tc=None): super().__init__(tag) @@ -3819,14 +5145,14 @@ def get_sdict(self): return rv -class BENodeIdxSH(_MsgBase): +class BENodeIdxSH(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. """ - _tc = MessageTypes.BE_NODEINDEX_SH + _tc = MessageTypes.BE_NODE_IDX_SH def __init__(self, tag, node_idx, host_name=None, ip_addrs=None, primary=None, logger_sdesc=None, _tc=None): @@ -3861,7 +5187,7 @@ def get_sdict(self): return rv -class BEPingSH(_MsgBase): +class BEPingSH(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3878,7 +5204,7 @@ def get_sdict(self): return rv -class BEHalted(_MsgBase): +class BEHalted(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3895,7 +5221,7 @@ def get_sdict(self): return rv -class LABroadcast(_MsgBase): +class LABroadcast(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -3917,14 +5243,14 @@ def get_sdict(self): return rv -class LAPassThruFB(_MsgBase): +class LAPassThruFB(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. """ - _tc = MessageTypes.LA_PASSTHRU_FB + _tc = MessageTypes.LA_PASS_THRU_FB def __init__(self, tag, c_uid, data, _tc=None): super().__init__(tag) @@ -3938,14 +5264,14 @@ def get_sdict(self): return rv -class LAPassThruBF(_MsgBase): +class LAPassThruBF(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. """ - _tc = MessageTypes.LA_PASSTHRU_BF + _tc = MessageTypes.LA_PASS_THRU_BF def __init__(self, tag, data, _tc=None): super().__init__(tag) @@ -3957,7 +5283,7 @@ def get_sdict(self): return rv -class LAServerMode(_MsgBase): +class LAServerMode(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4004,7 +5330,7 @@ def get_sdict(self): # TODO FIXME: if messages are in this hierarchy they must follow the rules. # This one does not; the spec needs fixing too. -class LAServerModeExit(_MsgBase): +class LAServerModeExit(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4028,7 +5354,7 @@ def get_sdict(self): return rv -class LAProcessDict(_MsgBase): +class LAProcessDict(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4045,7 +5371,7 @@ def get_sdict(self): return rv -class LAProcessDictResponse(_MsgBase): +class LAProcessDictResponse(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4072,14 +5398,14 @@ def get_sdict(self): return rv -class LADumpState(_MsgBase): +class LADumpState(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. """ - _tc = MessageTypes.LA_DUMP + _tc = MessageTypes.LA_DUMP_STATE def __init__(self, tag, filename=None, _tc=None): super().__init__(tag) @@ -4091,7 +5417,7 @@ def get_sdict(self): return rv -class LAChannelsInfo(_MsgBase): +class LAChannelsInfo(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
@@ -4107,6 +5433,10 @@ def __init__(self, tag, nodes_desc, gs_cd, num_gw_channels, port=dfacts.DEFAULT_ self.gs_cd = gs_cd self.transport = dfacts.TransportAgentOptions.from_str(transport) self.num_gw_channels = num_gw_channels + try: + self.fe_ext_ip_addr = get_external_ip_addr() + except OSError: + self.fe_ext_ip_addr = None self.nodes_desc = {} for key in nodes_desc.keys(): @@ -4130,7 +5460,7 @@ def get_sdict(self): return rv -class LoggingMsg(_MsgBase): +class LoggingMsg(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4179,7 +5509,7 @@ def get_logging_dict(self): return rv -class LoggingMsgList(_MsgBase): +class LoggingMsgList(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4212,7 +5542,7 @@ def get_sdict(self): return rv -class LogFlushed(_MsgBase): +class LogFlushed(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4229,7 +5559,7 @@ def get_sdict(self): return rv -class TAPingSH(_MsgBase): +class TAPingSH(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4246,7 +5576,7 @@ def get_sdict(self): return rv -class TAHalted(_MsgBase): +class TAHalted(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4263,7 +5593,7 @@ def get_sdict(self): return rv -class TAUp(_MsgBase): +class TAUp(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4289,7 +5619,7 @@ def get_sdict(self): return rv -class Breakpoint(_MsgBase): +class Breakpoint(InfraMsg): _tc = MessageTypes.BREAKPOINT def __init__(self, tag, p_uid, index, out_desc, in_desc, _tc=None): @@ -4308,7 +5638,7 @@ def get_sdict(self): return rv -class BEIsUp(_MsgBase): +class BEIsUp(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4328,7 +5658,7 @@ def get_sdict(self): return rv -class FENodeIdxBE(_MsgBase): +class FENodeIdxBE(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4388,7 +5718,7 @@ def get_sdict(self): return rv -class HaltLoggingInfra(_MsgBase): +class HaltLoggingInfra(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4405,7 +5735,7 @@ def get_sdict(self): return rv -class HaltOverlay(_MsgBase): +class HaltOverlay(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4422,7 +5752,7 @@ def get_sdict(self): return rv -class OverlayHalted(_MsgBase): +class OverlayHalted(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4438,7 +5768,7 @@ def get_sdict(self): return rv -class BEHaltOverlay(_MsgBase): +class BEHaltOverlay(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4454,7 +5784,7 @@ def get_sdict(self): return rv -class LAHaltOverlay(_MsgBase): +class LAHaltOverlay(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
@@ -4470,7 +5800,7 @@ def get_sdict(self): return rv -class OverlayPingBE(_MsgBase): +class OverlayPingBE(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4486,7 +5816,7 @@ def get_sdict(self): return rv -class OverlayPingLA(_MsgBase): +class OverlayPingLA(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4502,7 +5832,7 @@ def get_sdict(self): return rv -class LAExit(_MsgBase): +class LAExit(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. @@ -4519,14 +5849,61 @@ def get_sdict(self): rv['sigint'] = self.sigint return rv +class RuntimeDesc(InfraMsg): + """ + Refer to :ref:`definition` and :ref:`Common Fields` for a description of the + message structure. + """ + _tc = MessageTypes.RUNTIME_DESC + + def __init__(self, tag, gs_cd, gs_ret_cd, ls_cd, ls_ret_cd, fe_ext_ip_addr, head_node_ip_addr, oob_port, env, _tc=None): + super().__init__(tag) + self.gs_cd = gs_cd + self.gs_ret_cd = gs_ret_cd + self.ls_cd = ls_cd + self.ls_ret_cd = ls_ret_cd + # should we add "username" to the sdesc? + self.fe_ext_ip_addr = fe_ext_ip_addr + self.head_node_ip_addr = head_node_ip_addr + self.oob_port = oob_port + self.env = json.dumps(dict(env)) + # add something to help deal with differences in dir structure? + + def get_sdict(self): + rv = super().get_sdict() + rv['gs_cd'] = self.gs_cd + rv['gs_ret_cd'] = self.gs_ret_cd + rv['ls_cd'] = self.ls_cd + rv['ls_ret_cd'] = self.ls_ret_cd + rv['fe_ext_ip_addr'] = self.fe_ext_ip_addr + rv['head_node_ip_addr'] = self.head_node_ip_addr + rv['oob_port'] = self.oob_port + rv['env'] = json.loads(self.env) + return rv + +class UserHaltOOB(InfraMsg): + """ + Refer to :ref:`definition` and :ref:`Common Fields` for a description of the + message structure. + + """ + + _tc = MessageTypes.USER_HALT_OOB + + def __init__(self, tag, _tc=None): + super().__init__(tag) + + def get_sdict(self): + rv = super().get_sdict() + return rv -class TAUpdateNodes(_MsgBase): +class TAUpdateNodes(InfraMsg): """ Refer to :ref:`definition` and :ref:`Common Fields` for a description of the message structure. 
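RuntimeDesc keeps the environment as a JSON string internally and expands it back into a dict in get_sdict(). A minimal sketch of just that round trip, with hypothetical environment values and none of the other descriptor fields:

import json

class RuntimeEnv:                       # stand-in for the env handling in RuntimeDesc
    def __init__(self, env):
        # Store a JSON snapshot so the descriptor stays plain serializable text.
        self.env = json.dumps(dict(env))
    def get_sdict(self):
        return {'env': json.loads(self.env)}

desc = RuntimeEnv({'DRAGON_RT_UID': '42', 'PATH': '/usr/bin'})
assert desc.get_sdict()['env']['DRAGON_RT_UID'] == '42'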
""" - _tc = MessageTypes.HSTA_UPDATE_NODES + _tc = MessageTypes.TA_UPDATE_NODES def __init__(self, tag, nodes: list[Union[NodeDescriptor, dict]], @@ -4554,182 +5931,79 @@ def get_sdict(self): return rv -all_message_classes = [GSProcessCreate, - GSProcessCreateResponse, - GSProcessList, - GSProcessListResponse, - GSProcessQuery, - GSProcessQueryResponse, - GSProcessKill, - GSProcessKillResponse, - GSProcessJoin, - GSProcessJoinResponse, - GSChannelCreate, - GSChannelCreateResponse, - GSChannelList, - GSChannelListResponse, - GSChannelQuery, - GSChannelQueryResponse, - GSChannelDestroy, - GSChannelDestroyResponse, - GSChannelJoin, - GSChannelJoinResponse, - GSChannelDetach, - GSChannelDetachResponse, - GSChannelGetSendH, - GSChannelGetSendHResponse, - GSChannelGetRecvH, - GSChannelGetRecvHResponse, - AbnormalTermination, - GSStarted, - GSPingSH, - GSIsUp, - GSHeadExit, - GSChannelRelease, - GSHalted, - SHProcessCreate, - SHProcessCreateResponse, - SHProcessKill, - SHProcessExit, - SHChannelCreate, - SHChannelCreateResponse, - SHChannelDestroy, - SHChannelDestroyResponse, - SHLockChannel, - SHLockChannelResponse, - SHAllocMsg, - SHAllocMsgResponse, - SHAllocBlock, - SHAllocBlockResponse, - SHChannelsUp, - SHPingGS, - SHHalted, - SHFwdInput, - SHFwdInputErr, - SHFwdOutput, - SHDumpState, - GSTeardown, - SHTeardown, - SHPingBE, - BEPingSH, - BENodeIdxSH, - TAPingSH, - SHHaltTA, - TAHalted, - SHHaltBE, - BEHalted, - TAUp, - GSPingProc, - GSDump, - LABroadcast, - LAChannelsInfo, - LAPassThruBF, - LAPassThruFB, - LAServerMode, - LAServerModeExit, - LAProcessDict, - LAProcessDictResponse, - LADumpState, - GSPoolList, - GSPoolListResponse, - GSPoolCreate, - GSPoolCreateResponse, - GSPoolDestroy, - GSPoolDestroyResponse, - GSPoolQuery, - GSPoolQueryResponse, - SHPoolCreate, - SHPoolCreateResponse, - SHPoolDestroy, - SHPoolDestroyResponse, - SHExecMemRequest, - SHExecMemResponse, - GSUnexpected, - SHProcessKillResponse, - Breakpoint, - GSProcessJoinList, - GSProcessJoinListResponse, - GSNodeList, - GSNodeListResponse, - GSNodeQuery, - GSNodeQueryResponse, - GSNodeQueryTotalCPUCount, - GSNodeQueryTotalCPUCountResponse, - LoggingMsg, - LoggingMsgList, - LogFlushed, - BEIsUp, - FENodeIdxBE, - HaltOverlay, - HaltLoggingInfra, - OverlayPingBE, - OverlayPingLA, - LAHaltOverlay, - BEHaltOverlay, - OverlayHalted, - ExceptionlessAbort, - LAExit, - GSGroupCreate, - GSGroupCreateResponse, - GSGroupList, - GSGroupQuery, - GSGroupListResponse, - GSGroupCreateResponse, - GSGroupQueryResponse, - GSGroupKill, - GSGroupKillResponse, - GSGroupDestroy, - GSGroupDestroyResponse, - GSGroupAddTo, - GSGroupAddToResponse, - GSGroupRemoveFrom, - GSGroupRemoveFromResponse, - GSGroupCreateAddTo, - GSGroupCreateAddToResponse, - GSGroupDestroyRemoveFrom, - GSGroupDestroyRemoveFromResponse, - TAUpdateNodes] +PREDETERMINED_CAPS = {'GS':'GS', 'SH':'SH', 'TA':'TA', 'BE':'BE', 'FE':'FE', 'LA':'LA', 'BF':'BF', + 'FB':'FB', 'DD':'DD', 'SENDH':'SendH', 'RECVH':'RecvH', 'CPU': 'CPU', 'ID':'ID', + 'OOB':'OOB', 'KVL':'KVL', 'KV':'KV'} -mt_dispatch = {cls._tc.value: cls for cls in all_message_classes} +MSG_TYPES_WITHOUT_CLASSES = {MessageTypes.DRAGON_MSG} + +def type_filter(the_msg_types): + msg_types = set(the_msg_types) - MSG_TYPES_WITHOUT_CLASSES + return msg_types + +def camel_case_msg_name(msg_id): + + lst = msg_id.split('.')[1].split('_') + cased = [] + + for word in lst: + if word in PREDETERMINED_CAPS: + cased.append(PREDETERMINED_CAPS[word]) + else: + cased.append(word[0].upper()+word[1:].lower()) + + converted = "".join(cased) + 
return converted +def mk_all_message_classes_set(): + result = set() + for msg_id in type_filter(MessageTypes): + try: + class_name = camel_case_msg_name(str(msg_id)) + class_def = getattr(sys.modules[__name__],class_name) + result.add(class_def) + except: + raise TypeError(f'Unable to find corresponding class {class_name} for message id {msg_id}.') + + return result -def parse(jstring, restrict=None): +all_message_classes = mk_all_message_classes_set() + +mt_dispatch = {cls._tc.value: cls for cls in all_message_classes} +def parse(serialized, restrict=None): try: # if a compressed message, decompress to get the service message - jstring = zlib.decompress(base64.b64decode(jstring)) - except zlib.error as zerr: - pass + try: + decoded = b64decode(serialized) + except: + decoded = serialized + + try: + jstring = zlib.decompress(decoded) + except zlib.error: + jstring = decoded + - try: sdict = json.loads(jstring) - except TypeError as e: - raise TypeError(f'The message "{jstring}" could not be parsed.') from e - - typecode = sdict['_tc'] - - if restrict: - assert typecode in restrict - - return mt_dispatch[typecode].from_sdict(sdict) - - -# types of all the messages that global services can receive -all_gs_messages = {GSProcessCreate, GSProcessList, GSProcessQuery, - GSProcessKill, GSProcessJoin, GSChannelCreate, - GSChannelList, GSChannelQuery, GSChannelDestroy, - GSChannelJoin, GSChannelDetach, GSChannelGetSendH, - GSChannelGetRecvH, GSChannelRelease, SHProcessCreateResponse, - SHProcessExit, SHChannelCreateResponse, - SHChannelDestroyResponse, SHLockChannelResponse, - SHAllocMsgResponse, SHAllocBlockResponse, - SHPingGS, GSTeardown, GSDump, GSPoolList, GSPoolCreate, GSPoolQuery, - GSPoolDestroy, SHProcessKillResponse, - GSPoolListResponse, GSPoolDestroyResponse, GSPoolCreateResponse, - GSPoolQueryResponse, GSTeardown, - SHPoolCreateResponse, SHPoolDestroyResponse, GSProcessJoinList, - GSNodeList, GSNodeQuery, GSNodeQueryTotalCPUCount, - GSGroupList, GSGroupCreate, GSGroupQuery, - GSGroupListResponse, GSGroupDestroy, GSGroupAddTo, - GSGroupRemoveFrom, GSGroupCreateAddTo, - GSGroupKill, GSGroupDestroyRemoveFrom} + typecode = sdict['_tc'] + if restrict: + assert typecode in restrict + + return mt_dispatch[typecode].from_sdict(sdict) + + + except Exception as json_exception: + try: + # A DecodeError probaby indicates this is a CapnProto message so we'll + # try parsing it that way before returning + + msg = CapNProtoMsg.deserialize(serialized) + + if restrict: + assert msg.tc in restrict + + return msg + except Exception as ex: + tb = traceback.format_exc() + raise TypeError(f'The message "{serialized}" could not be parsed.\nJSON Parsing Error Message:{json_exception}\nCapnProto Parsing Error Message:{ex}\n Traceback {tb}') diff --git a/src/dragon/infrastructure/node_desc.py b/src/dragon/infrastructure/node_desc.py index 40b84d7..09d9c94 100644 --- a/src/dragon/infrastructure/node_desc.py +++ b/src/dragon/infrastructure/node_desc.py @@ -8,7 +8,7 @@ import enum import re import os -from socket import gethostname, socket, AF_INET, SOCK_STREAM +from socket import gethostname from typing import Optional from .facts import DEFAULT_TRANSPORT_NETIF, DEFAULT_OVERLAY_NETWORK_PORT, DEFAULT_PORT_RANGE @@ -26,7 +26,9 @@ class State(enum.IntEnum): DISCOVERABLE = enum.auto() PENDING = enum.auto() ACTIVE = enum.auto() + IDLE = enum.auto() ERROR = enum.auto() + DOWN = enum.auto() # TODO: this is a stub (PE-42397). How do we deal with: # * networks and their encryption ? 
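The message-class registry in messages.py now derives each class name from the MessageTypes member name, which is why several type codes were renamed above (GS_DUMP to GS_DUMP_STATE, LA_PASSTHRU_FB to LA_PASS_THRU_FB, and so on). A standalone sketch of the casing rule, using a subset of the PREDETERMINED_CAPS table:

PREDETERMINED_CAPS = {'GS': 'GS', 'SH': 'SH', 'TA': 'TA', 'BE': 'BE', 'FE': 'FE',
                      'LA': 'LA', 'BF': 'BF', 'FB': 'FB', 'DD': 'DD'}

def camel_case_msg_name(msg_id: str) -> str:
    # 'MessageTypes.LA_PASS_THRU_FB' -> ['LA', 'PASS', 'THRU', 'FB'] -> 'LAPassThruFB'
    words = msg_id.split('.')[1].split('_')
    cased = [PREDETERMINED_CAPS.get(w, w[0].upper() + w[1:].lower()) for w in words]
    return ''.join(cased)

assert camel_case_msg_name('MessageTypes.GS_PROCESS_CREATE') == 'GSProcessCreate'
assert camel_case_msg_name('MessageTypes.LA_PASS_THRU_FB') == 'LAPassThruFB'
assert camel_case_msg_name('MessageTypes.GS_DUMP_STATE') == 'GSDumpState'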
@@ -84,40 +86,80 @@ def __str__(self): return f"name:{self.name}, host_id:{self.host_id} at {self.ip_addrs}, state:{self.state.name}" @classmethod - def make_for_current_node(cls, name: Optional[str] = None, ip_addrs: Optional[list[str]] = None, is_primary: bool = False): - """Create a serialized node descriptor for the node this Shepherd is running on. - Can only be used to send to GS, as h_uid will be None in the descriptor. - - :param name: hostname of this node, defaults to None - :type name: str, optional - :param ip_addrs: List of the IP addresses for this node, defaults to ["127.0.0.1"] - :type ip_addrs: list[str], optional + def get_localservices_node_conf(cls, + name: Optional[str] = None, + host_name: Optional[str] = None, + host_id: Optional[int] = None, + is_primary: bool = False, + ip_addrs: Optional[list[str]] = None, + shep_cd: Optional[str] = None, + cpu_devices: Optional[list[int]] = None, + accelerators: Optional[AcceleratorDescriptor] = None): + """Return a NodeDescriptor object for Local Services to pass into its SHChannelsUp message + + Populates the values in a NodeDescriptor object that Local Services needs to provide to the + launcher frontend as part of infrastructure bring-up + + + :param name: Name for node. Often resorts to hostname, defaults to None + :type name: Optional[str], optional + :param host_name: Hostname for the node, defaults to gethostname() + :type host_name: Optional[str], optional + :param host_id: unique host ID of this node, defaults to get_host_id() + :type host_id: Optional[int], optional :param is_primary: denote if this is the primary node running GS, defaults to False :type is_primary: bool, optional - :return: serialized Node descruptor - :rtype: dict + :param ip_addrs: IP addresses used for backend messaging by transport agents, defaults to ["127.0.0.1"] + :type ip_addrs: Optional[list[str]], optional + :param shep_cd: Channel descriptor for this node's Local Services, defaults to None + :type shep_cd: Optional[str], optional + :param cpu_devices: List of CPUs and IDs on this node, defaults to list(os.sched_getaffinity(0)) + :type cpu_devices: Optional[list[int]], optional + :param accelerators: List of any accelerators available on this node, defaults to find_accelerators() + :type accelerators: Optional[AcceleratorDescriptor], optional """ - state = cls.State.ACTIVE + from dragon.infrastructure import parameters as dparms - huid = get_host_id() # will become h_uid in GS + if host_name is None: + host_name = gethostname() - if name is None: - name = f"Node-{huid}" + if host_id is None: + host_id = get_host_id() + + if cpu_devices is None: + cpu_devices = list(os.sched_getaffinity(0)) + + if accelerators is None: + accelerators = find_accelerators() if ip_addrs is None: ip_addrs = ["127.0.0.1"] + + state = cls.State.ACTIVE + if name is None: + name = f"Node-{host_id}" + + if shep_cd is None: + shep_cd = dparms.this_process.local_shep_cd + num_cpus = os.cpu_count() physical_mem = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") - host_name = gethostname() - desc = cls( - state=state, name=name, ip_addrs=ip_addrs, num_cpus=num_cpus, - physical_mem=physical_mem, is_primary=is_primary, host_id=huid, host_name=host_name - ) - return desc + return cls(state=state, + name=name, + host_name=host_name, + ip_addrs=ip_addrs, + host_id=host_id, + shep_cd=shep_cd, + is_primary=is_primary, + num_cpus=num_cpus, + physical_mem=physical_mem, + cpu_devices=cpu_devices, + accelerators=accelerators) + @classmethod def 
get_local_node_network_conf(cls, @@ -188,47 +230,6 @@ def get_local_node_network_conf(cls, raise RuntimeError(f'Could not find available port for IP address={ip_addr} in port range {port_range}') - @classmethod - def get_localservices_node_conf(cls, - name: str = "", - host_name: str = '', - host_id: int = None, - ip_addrs: Optional[list[str]] = None, - shep_cd: str = '', - cpu_devices: Optional[list[int]] = None, - accelerators: Optional[AcceleratorDescriptor] = None): - """Return a NodeDescriptor object for Local Services to pass into its SHChannelsUp message - - Populates the values in a NodeDescriptor object that Local Services needs to provide to the - launcher frontend as part of infrastructure bring-up - - :param name: Name for node. Often resorts to hostname, defaults to "" - :type name: str, optional - :param host_name: Hostname for the node, defaults to '' - :type host_name: str, optional - :param host_id: unique host ID of this node, defaults to None - :type host_id: int, optional - :param ip_addrs: IP addresses used for backend messaging by transport agents, defaults to None - :type ip_addrs: list[str], optional - :param shep_cd: Channel descriptor for this node's Local Services, defaults to '' - :type shep_cd: str, optional - :param cpu_devices: List of CPUs and IDs on this node, defaults to None - :type cpu_devices: list[int], optional - :param accelerators: List of any accelerators available on this node, defaults to None - :type accelerators: AcceleratorDescriptor, optional - """ - - from dragon.infrastructure import parameters as dparms - - return cls(state=NodeDescriptor.State.ACTIVE, - name=name, - host_name=host_name, - ip_addrs=ip_addrs, - host_id=get_host_id(), - shep_cd=dparms.this_process.local_shep_cd, - cpu_devices=list(os.sched_getaffinity(0)), - accelerators=find_accelerators()) - @property def sdesc(self): return self.get_sdict() @@ -247,7 +248,8 @@ def get_sdict(self): "physical_mem": self.physical_mem, "shep_cd": self.shep_cd, "overlay_cd": self.overlay_cd, - "cpu_devices": self.cpu_devices + "cpu_devices": self.cpu_devices, + "state": self.state } # Account for a NULL accelerator giving us a None for now @@ -261,4 +263,10 @@ def get_sdict(self): @classmethod def from_sdict(cls, sdict): sdict["state"] = NodeDescriptor.State(sdict["state"]) + try: + if sdict["accelerators"] is not None: + sdict["accelerators"] = AcceleratorDescriptor.from_sdict(sdict["accelerators"]) + except KeyError: + sdict["accelerators"] = None + return NodeDescriptor(**sdict) diff --git a/src/dragon/infrastructure/parameters.py b/src/dragon/infrastructure/parameters.py index 4118f0b..6c72250 100644 --- a/src/dragon/infrastructure/parameters.py +++ b/src/dragon/infrastructure/parameters.py @@ -61,7 +61,7 @@ def check_pool(pool): return pool in {dfacts.PATCHED, dfacts.NATIVE} -def cast_wait_mode(wait_mode): +def cast_wait_mode(wait_mode=dfacts.INFRASTRUCTURE_DEFAULT_WAIT_MODE): if isinstance(wait_mode, dtypes.WaitMode): return wait_mode if wait_mode == "IDLE_WAIT": @@ -92,7 +92,7 @@ def cast_wait_mode(wait_mode): -def cast_return_when_mode(return_when): +def cast_return_when_mode(return_when=dfacts.INFRASTRUCTURE_DEFAULT_RETURN_WHEN_MODE): if isinstance(return_when, dtypes.ReturnWhen): return return_when if return_when == "WHEN_IMMEDIATE": @@ -142,6 +142,7 @@ def check_base64(strdata): return decoded_ok +typecast = lambda ty: lambda val: ty() if val == '' else ty(val) class LaunchParameters: """Launch Parameters for Dragon processes. 
@@ -381,6 +382,9 @@ def set_num_gateways_per_node(self, num_gateways=dfacts.DRAGON_DEFAULT_NUM_GW_CH LaunchParameters.init_class_vars() this_process = LaunchParameters.from_env() +def reload_this_process(): + global this_process + this_process = LaunchParameters.from_env() class Policy: """Used to encapsulate policy decisions. diff --git a/src/dragon/infrastructure/policy.py b/src/dragon/infrastructure/policy.py index 0a8c2d4..9859f74 100644 --- a/src/dragon/infrastructure/policy.py +++ b/src/dragon/infrastructure/policy.py @@ -71,6 +71,7 @@ class Device(enum.IntEnum): """ Which type of device the affinity policy will apply to """ + CPU = enum.auto() GPU = enum.auto() DEFAULT = enum.auto() @@ -94,6 +95,7 @@ class Affinity(enum.IntEnum): # TODO: Not implemented class WaitMode(enum.IntEnum): """Channel WaitMode type""" + IDLE = enum.auto() SPIN = enum.auto() DEFAULT = enum.auto() diff --git a/src/dragon/infrastructure/util.py b/src/dragon/infrastructure/util.py index e98f4b4..db75cd2 100644 --- a/src/dragon/infrastructure/util.py +++ b/src/dragon/infrastructure/util.py @@ -29,7 +29,8 @@ import logging from warnings import warn from .parameters import this_process -from socket import socket, AF_INET, SOCK_STREAM +from socket import socket, AF_INET, SOCK_STREAM, SOCK_DGRAM, gethostname, inet_aton +import struct from .facts import FIRST_PUID @@ -356,6 +357,7 @@ def survey_dev_shm(): except FileNotFoundError: pass + def compare_dev_shm(previous): """Warns if there are files owned by current user in /dev/shm not previously seen @@ -524,6 +526,17 @@ def port_check(ip_port): return True +def get_port(): + host = gethostname() + min_port = 1025; max_port = 65536 + + for port in range(min_port, max_port + 1): + if port_check((host, port)): + return port + + raise RuntimeError('No available ports') + + def get_host_info(network_prefix) -> tuple[str, str, list[str]]: """Return username, hostname, and list of IP addresses.""" from dragon.transport.ifaddrs import getifaddrs, InterfaceAddressFilter @@ -602,3 +615,31 @@ def isEmpty(self): def clear(self): # Clears the stack of all items, resetting it to an empty stack. 
self.items = [] + + +# get external IP address +def get_external_ip_addr(): + s = socket(AF_INET, SOCK_DGRAM) + s.settimeout(0) + connected = False + try: + # doesn't even have to be reachable + s.connect(('10.254.254.254', 1)) + connected = True + ip_addr = s.getsockname()[0] + s.close() + return ip_addr + except Exception: + if connected: + s.close + raise + + +def rt_uid_from_ip_addrs(fe_ext_ip_addr, head_node_ip_addr): + fe_ext_packed_ip = inet_aton(fe_ext_ip_addr) + head_node_packed_ip = inet_aton(head_node_ip_addr) + + fe_ext_int = struct.unpack("!L", fe_ext_packed_ip)[0] + head_node_int = struct.unpack("!L", head_node_packed_ip)[0] + + return (fe_ext_int << 32) | head_node_int diff --git a/src/dragon/launcher/backend.py b/src/dragon/launcher/backend.py index 293c561..27c1c81 100644 --- a/src/dragon/launcher/backend.py +++ b/src/dragon/launcher/backend.py @@ -3,8 +3,10 @@ import threading import re import subprocess +import signal from enum import Enum from functools import total_ordering +from time import sleep from ..utils import B64 from ..managed_memory import MemoryPool, DragonPoolError, DragonMemoryError @@ -505,6 +507,20 @@ def _construct_child_forwarding(self, return fwd_conns + def _sigterm_handler(self, *args): + """Handle transmitting AbnormalTermination to frontend if SIGTERM comes from WLM""" + + log = logging.getLogger(dls.LA_BE).getChild('_sigterm_handler') + + # If overlay threads aren't up, wait till they are + while self._state < BackendState.OVERLAY_THREADS_UP: + sleep(0.1) + + # Now send an exceptionless abort which will get an AbnormalTerm to the frontend, + # and then we're done + log.debug('sending exceptionless abort due to SIGTERM signal') + self.la_be_stdout.send(dmsg.ExceptionlessAbort(tag=dlutil.next_tag()).serialize()) + def run_startup(self, arg_ip_addr: str, arg_host_id: str, @@ -525,6 +541,15 @@ def run_startup(self, """ log = logging.getLogger(dls.LA_BE).getChild('run_startup') + # Take control of SIGTERM signal from WLM + try: + self._orig_sigterm = signal.signal(signal.SIGTERM, self._sigterm_handler) + log.debug("got sigterm signal handling in place") + except ValueError: + # this error is thrown if we are running inside a child thread + # which we do for unit tests. So pass on this + log.debug("Unable to do signal handling outside of main thread") + net_conf = NodeDescriptor.get_local_node_network_conf() self.host_id = int(net_conf.host_id) self.hostname = net_conf.name @@ -713,7 +738,6 @@ def run_startup(self, assert isinstance(sh_ping_be_msg, dmsg.SHPingBE), 'la_be ping from ls expected' log.info('la_be recv SHPingBE - m3') - # switch to comms with local services over channels and proceed with bring up self.ls_channel = Channel.attach(B64.from_str(sh_ping_be_msg.shep_cd).decode()) self.la_channel = Channel.attach(B64.from_str(sh_ping_be_msg.be_cd).decode()) @@ -796,7 +820,7 @@ def send_messages_to_overlaynet(self, la_be_stdout: dlutil.SRQueue): # Handle the case that we're able to catch an abort and need to send # that to the frontend but then assume the frontend will no longer talk to us. 
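The rt_uid_from_ip_addrs() helper added to infrastructure/util.py above packs the frontend's external IPv4 address into the high 32 bits of the runtime UID and the head node's address into the low 32 bits. A small sketch of the packing, plus a hypothetical inverse that is not part of the patch but shows both addresses are recoverable:

import struct
from socket import inet_aton, inet_ntoa

def rt_uid_from_ip_addrs(fe_ext_ip_addr: str, head_node_ip_addr: str) -> int:
    fe = struct.unpack('!L', inet_aton(fe_ext_ip_addr))[0]
    head = struct.unpack('!L', inet_aton(head_node_ip_addr))[0]
    return (fe << 32) | head

def ip_addrs_from_rt_uid(rt_uid: int) -> tuple[str, str]:
    # Reverse of the packing: high word is the frontend, low word the head node.
    fe = inet_ntoa(struct.pack('!L', rt_uid >> 32))
    head = inet_ntoa(struct.pack('!L', rt_uid & 0xFFFFFFFF))
    return fe, head

rt_uid = rt_uid_from_ip_addrs('10.1.2.3', '192.168.0.7')
assert ip_addrs_from_rt_uid(rt_uid) == ('10.1.2.3', '192.168.0.7')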
if isinstance(msg, dmsg.ExceptionlessAbort): - self.infra_out.send(dmsg.AbnormalTermination(tag=dlutil.next_tag()).serialize()) + self.infra_out.send(dmsg.AbnormalTermination(tag=dlutil.next_tag(), host_id=self.host_id).serialize()) self._abnormally_terminating = True self.to_overlaynet_log.info('forwarded AbnormalTermination to frontend via ExceptionlessAbort') @@ -921,7 +945,7 @@ def forward_to_leaves(self, msg): """Forward an infrastructure messages to leaves I'm responsible for :param msg: Infrastructure message - :type msg: dragon.infrastructure.messages._MsgBase + :type msg: dragon.infrastructure.messages.InfraMsg """ for conn in self.frontend_fwd_conns.values(): conn.send(msg.serialize()) diff --git a/src/dragon/launcher/dragon_multi_fe.py b/src/dragon/launcher/dragon_multi_fe.py index 42a9e40..a82932b 100755 --- a/src/dragon/launcher/dragon_multi_fe.py +++ b/src/dragon/launcher/dragon_multi_fe.py @@ -4,17 +4,19 @@ import os import logging -from .frontend import LauncherFrontEnd, LAUNCHER_FAIL_EXIT +from .frontend import LauncherFrontEnd, LAUNCHER_FAIL_EXIT, LAUNCHER_SUCCESS_EXIT from .launchargs import get_args as get_cli_args from ..utils import set_procname, set_host_id, host_id from ..dlogging.util import setup_FE_logging, DragonLoggingServices as dls from ..infrastructure.facts import PROCNAME_LA_FE, FRONTEND_HOSTID +from ..infrastructure.node_desc import NodeDescriptor -def main(): +def main(args_map=None): - args_map = get_cli_args() + if args_map is None: + args_map = get_cli_args() setup_FE_logging(log_device_level_map=args_map['log_device_level_map'], basename='dragon', basedir=os.getcwd()) @@ -29,16 +31,45 @@ def main(): if value is not None: log.info(f'args_map: {key}: {value}') - with LauncherFrontEnd(args_map=args_map) as fe_server: + execution_complete = False + net_conf = None + + while not execution_complete: + # Try to run the launcher + try: - fe_server.run_startup() - fe_server.run_app() - fe_server.run_msg_server() + with LauncherFrontEnd(args_map=args_map) as fe_server: + net_conf = fe_server.run_startup(net_conf=net_conf) + net_conf = fe_server.run_app() + net_conf = fe_server.run_msg_server() + + # Handle an obvious exception as well as what to do if we're trying a resilient runtime except Exception as err: log.exception(f'Error in launcher frontend: {err}') - return LAUNCHER_FAIL_EXIT + if not fe_server.resilient: + return LAUNCHER_FAIL_EXIT + + # Check if the sum of active and idle nodes is > 0: + avail_nodes = len([idx for idx, node in net_conf.items() + if node.state in [NodeDescriptor.State.ACTIVE, NodeDescriptor.State.IDLE] and idx !='f']) + log.info(f'avail nodes found to be {avail_nodes}') + + # Proceed + if args_map['exhaust_resources']: + if avail_nodes == 0: + print("There are no more hardware resources available for continued app execution.") + return LAUNCHER_FAIL_EXIT + elif avail_nodes == args_map['node_count'] - 1: + print("There are not enough hardware resources available for continued app execution.") + return LAUNCHER_FAIL_EXIT + + + # If everything exited wtihout exception, break out of the loop and exit + else: + execution_complete = True log.info("exiting frontend") + return LAUNCHER_SUCCESS_EXIT if __name__ == "__main__": diff --git a/src/dragon/launcher/dragon_single.py b/src/dragon/launcher/dragon_single.py index fb47fe1..573b995 100755 --- a/src/dragon/launcher/dragon_single.py +++ b/src/dragon/launcher/dragon_single.py @@ -113,6 +113,16 @@ def shutdown_monitor(la_in): def main(): arg_map = launchargs.get_args() + try: + 
runtime_ip_addr = dutil.get_external_ip_addr().split(':')[0] + except OSError: + runtime_ip_addr = None + + if runtime_ip_addr is not None: + os.environ['DRAGON_FE_EXTERNAL_IP_ADDR'] = runtime_ip_addr + os.environ['DRAGON_HEAD_NODE_IP_ADDR'] = runtime_ip_addr + os.environ['DRAGON_RT_UID'] = str(dutil.rt_uid_from_ip_addrs(runtime_ip_addr, runtime_ip_addr)) + dlog.setup_FE_logging(log_device_level_map=arg_map['log_device_level_map'], basename='dragon', basedir=os.getcwd()) diff --git a/src/dragon/launcher/frontend.py b/src/dragon/launcher/frontend.py index 8dd1823..126cd80 100644 --- a/src/dragon/launcher/frontend.py +++ b/src/dragon/launcher/frontend.py @@ -8,6 +8,7 @@ from enum import Enum from shlex import quote from functools import total_ordering +from typing import Optional from ..utils import B64 from ..channels import Channel, ChannelError, ChannelEmpty, register_gateways_from_env, discard_gateways @@ -18,7 +19,7 @@ from ..dlogging.util import _get_dragon_log_device_level, LOGGING_OUTPUT_DEVICE_DRAGON_FILE from ..dlogging.logger import DragonLogger, DragonLoggingError -from ..infrastructure.util import route +from ..infrastructure.util import route, get_external_ip_addr, rt_uid_from_ip_addrs from ..infrastructure.parameters import POLICY_INFRASTRUCTURE, this_process from ..infrastructure.connection import Connection, ConnectionOptions from ..infrastructure.node_desc import NodeDescriptor @@ -33,6 +34,7 @@ from .wlm.ssh import SSHSubprocessPopen LAUNCHER_FAIL_EXIT = 1 +LAUNCHER_SUCCESS_EXIT = 0 class LauncherImmediateExit(Exception): @@ -139,6 +141,7 @@ def __init__(self, args_map, sigint_trigger=None): self.args_map = args_map self.nnodes = args_map.get('node_count', 0) + self.n_idle = args_map.get('idle_count', 0) self.ntree_nodes = this_process.overlay_fanout self.network_prefix = args_map.get('network_prefix', dfacts.DEFAULT_TRANSPORT_NETIF) self.port = args_map.get('port', dfacts.DEFAULT_TRANSPORT_PORT) @@ -164,9 +167,18 @@ def __init__(self, args_map, sigint_trigger=None): When using SSH to execute dragon jobs, the TCP transport agent is the only allowed agent. Please resubmit your dragon launch command with the `--transport tcp` option set. ''' - print(msg) + print(msg, flush=True) sys.exit(LAUNCHER_FAIL_EXIT) + # Handle some sanity checks on the resilient mode + + # If using resilient mode, confirm --nodes or --idle is set. 
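(The resilient-mode sanity checks continue below.) These checks pair with the retry loop added to dragon_multi_fe.py earlier in this patch, which counts the nodes still usable after a failure before deciding whether to relaunch. A condensed sketch of that kind of availability decision, using plain state strings in place of the real NodeDescriptor objects and simplified conditions rather than the exact ones in the patch:

    def should_retry(net_conf: dict, node_count: int, exhaust_resources: bool) -> bool:
        # 'f' is the frontend entry; only backend nodes count toward availability.
        avail = sum(1 for idx, state in net_conf.items()
                    if idx != 'f' and state in ('ACTIVE', 'IDLE'))
        if exhaust_resources:
            # Keep going until no backend nodes are left at all.
            return avail > 0
        # Otherwise stop once the requested node count can no longer be satisfied.
        return avail >= node_count

    conf = {'f': 'ACTIVE', '0': 'ACTIVE', '1': 'DOWN', '2': 'IDLE'}
    assert should_retry(conf, node_count=2, exhaust_resources=False)      # 2 usable >= 2
    assert not should_retry(conf, node_count=3, exhaust_resources=False)  # 2 usable < 3
    assert should_retry(conf, node_count=3, exhaust_resources=True)       # still 2 > 0

With the options added to launchargs.py later in this patch, such a run would be requested with something like `dragon --resilient --nodes 4 --idle 1 my_app.py` (an illustrative invocation, not taken verbatim from the patch).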
+ # Checks on a sane node count occur in the frontend code + self.resilient = args_map.get('resilient', False) + if self.resilient: + if self.nnodes == 0 and self.n_idle == 0: + raise RuntimeError("resilient flag requires setting of '--nodes' or '--idle'") + # Variety of other state trackers: self._sigint_count = 0 self._sigint_timeout = 5.0 @@ -205,7 +217,10 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): + log = logging.getLogger(dls.LA_FE).getChild('_cleanup') + log.debug('doing __exit__ cleanup') self._cleanup() + log.debug('exiting frontend via __exit__') def _kill_backend(self): '''Simple function for transmitting SIGKILL to backend with helpful message''' @@ -506,10 +521,10 @@ def _dragon_cleanup_bumpy_exit(self): # Make sure we don't manage to call this more than once self._bumpy_exit.clear() - def sigint_handler(self, *args): + def _sigint_handler(self, *args): """Handler for SIGINT signals for graceful teardown """ - log = logging.getLogger('sigint_handler') + log = logging.getLogger('_sigint_handler') log.debug('Entered sigint handler') self.sigint_log = logging.getLogger('sigint_route') self._sigint_count = self._sigint_count + 1 @@ -526,6 +541,25 @@ def sigint_handler(self, *args): else: log.warning(f'SIGINT detected in {self._STATE} and no routing exists for it') + + def _sighup_handler(self, *args): + log = logging.getLogger('_sighup_handler') + # needed for logging of set_quick_teardown path + self.sigint_log = logging.getLogger('sighup_route') + self._sigint_count = 2 + + print("Frontend detected SIGHUP or SIGTERM from WLM. Attempting to clean up...", flush=True) + log.debug('caught sighup or sigterm on frontend') + self._set_quick_teardown() + + if self._STATE.value in LauncherFrontEnd._STBL: + self._STBL[self._STATE.value][0](self) + else: + log.warning(f'SIGHUP or SIGTERM detected in {self._STATE} and no routing exists for it') + + def _sigterm_handler(self, *args): + self._dragon_cleanup_bumpy_exit() + @route(FrontendState.NET_CONFIG.value, _STBL) def _sigint_net_config(self): # if we're here, we should be able to just exit @@ -694,13 +728,19 @@ def construct_bcast_tree(self, net_conf, conn_policy, be_ups, frontend_sdesc): # Send out the FENodeIdx to the child nodes I own conn_outs = {} # key is the node_index and value is the Connection object - fe_node_idx = dmsg.FENodeIdxBE(tag=dlutil.next_tag(), - node_index=0, - forward=forwarding, - send_desc=frontend_sdesc) - log.debug(f'fanout = {this_process.overlay_fanout}') - for idx in range(this_process.overlay_fanout): - if idx < self.nnodes: + fe_node_idx_msg = dmsg.FENodeIdxBE(tag=dlutil.next_tag(), + node_index=0, + forward=forwarding, + send_desc=frontend_sdesc) + + # Create a counter that increments to ensure there's an increment of 1 for each + # active node index. 
Otherwise, there may be problems in the bcast algorithm + fe_node_index = 0 + for idx in range(len(net_conf) - 1): + + # If we haven't grabbed enough nodes, do so and make sure they're active + if fe_node_index < this_process.overlay_fanout and net_conf[str(idx)].state is NodeDescriptor.State.ACTIVE: + log.debug(f'constructing FENodeIdxBE for {net_conf[str(idx)]} (idx = {idx} | fe_node_index = {fe_node_index})') try: be_sdesc = B64.from_str(forwarding[str(idx)].overlay_cd) be_ch = Channel.attach(be_sdesc.decode(), mem_pool=self.fe_mpool) @@ -711,28 +751,160 @@ def construct_bcast_tree(self, net_conf, conn_policy, be_ups, frontend_sdesc): conn_out.ghost = True # Update the node index to the one we're talking to - fe_node_idx.node_index = idx - log.debug(f'sending {fe_node_idx.uncompressed_serialize()}') - conn_out.send(fe_node_idx.serialize()) + fe_node_idx_msg.node_index = fe_node_index + log.debug(f'sending {fe_node_idx_msg.uncompressed_serialize()}') + conn_out.send(fe_node_idx_msg.serialize()) - conn_outs[idx] = conn_out + conn_outs[fe_node_index] = conn_out + fe_node_index = fe_node_index + 1 except ChannelError as ex: log.fatal(f'could not connect to BE channel with host_id {be_up.host_id}') raise RuntimeError('Connection with BE failed') from ex - else: - break log.info('sent all FENodeIdxBE msgs') return conn_outs - def run_startup(self): + def _set_node_to_down_state(self, + host_id: int): + """Given a host ID, update the Net conf, setting the node to a down state + + :param host_id: host ID of down node + :type host_id: int + """ + + log = logging.getLogger(dls.LA_FE).getChild('_set_node_to_down_state') + + for index, node in self.net_conf.items(): + if node.host_id == host_id: + log.debug(f'setting node index {index}, hostname {node.host_name} to down') + node.state = NodeDescriptor.State.DOWN + + def _define_node_pools(self, + net_conf: dict): + """Make net config match what we know about our node pools + + :param net_conf: backend network configuration made of NodeDescriptor objects index by string integer + :type net_conf: dict + """ + + log = logging.getLogger(dls.LA_FE).getChild('_define_node_pools') + + # Work out logic for our node counts: + nnodes = len(net_conf) - 1 # Remove the frontend from consideration + all_avail_nodes = nnodes + log.debug(f'requested {self.nnodes} and got {nnodes}') + if self.nnodes > 0: + if self.nnodes > nnodes: + log.exception('too many nodes requested') + raise ValueError('Not enough backend nodes allocated to match requested') + nnodes = self.nnodes + else: + self.nnodes = nnodes + + # If doing resilient training, do some sanity checks on requested # nodes, idle nodes + if self.resilient: + # Confirm the number of idle nodes requested agrees with nnodes + if self.n_idle != 0: + if all_avail_nodes - (self.nnodes + self.n_idle) < 0: + msg = f"Sum of requested active ({self.nnodes}) and idle ({self.n_idle}) nodes is greater than available ({all_avail_nodes})" + raise RuntimeError(msg) + else: + self.n_idle = all_avail_nodes - self.nnodes + log.debug(f"Executing resilient mode with {self.n_idle} idle nodes") + + # Make sure the number of active/idle nodes matches what's been requested + n_active = len([node for index, node in net_conf.items() + if index != 'f' and node.state is NodeDescriptor.State.ACTIVE]) + + n_down = len([node for index, node in net_conf.items() + if index != 'f' and node.state is NodeDescriptor.State.DOWN]) + log.debug(f'currently have {n_active} active nodes against {n_down} down and {self.n_idle} idle') + if n_active != 
self.nnodes or n_down != 0: + current_active = 0 + for index, node in net_conf.items(): + if index == 'f': + continue + elif current_active != self.nnodes and node.state is not NodeDescriptor.State.DOWN: + if node.state in [NodeDescriptor.State.IDLE, NodeDescriptor.State.ACTIVE]: + node.state = NodeDescriptor.State.ACTIVE + current_active = current_active + 1 + elif node.state != NodeDescriptor.State.DOWN: + node.state = NodeDescriptor.State.IDLE + + log.debug(f'Updated active list has {current_active} active nodes') + # Make sure the number of active nodes is not 0 + if current_active == 0: + raise RuntimeError('No available backend hardware resources to use') + + # Log if there are fewer than requested nodes available because too many have been + # marked as down + if current_active != self.nnodes: + self.nnodes = current_active + msg = '''There are fewer available backend nodes than requested. +Will continue using the available nodes until all resources are exhausted. +Performance may be suboptimal.''' + log.warning(msg) + + # Make sure there is a designated primary node in the configuration and select one if there isn't + primary_election = [index for index, node in net_conf.items() + if node.state is NodeDescriptor.State.ACTIVE and node.is_primary] + + # Make sure there is just 1 primary node in case something screwy happened + if len(primary_election) != 1: + prim_set = False + for node in net_conf.values(): + if node.state is NodeDescriptor.State.ACTIVE and not prim_set: + log.debug(f'node {node} is now primary') + node.is_primary = True + prim_set = True + elif node.is_primary and prim_set: + node.is_primary = False + + return net_conf + + def _define_overlay_network(self, + net_conf: dict, + fe_host_id: int, + fe_ip_addr: str): + """Extract backend IP addresses, hostnames, and host IDs for overlay network comms + + :param net_conf: Network configuration dict of NodeDescriptors keyed by string node index + :type net_conf: dict + :param fe_host_id: Host ID of frontend node + :type fe_host_id: int + :param fe_ip_addr: IP address of frontend node + :type fe_ip_addr: str + """ + + # Add as many as needed to meet the requested node count. + # We also find the minimum number of network interface cards + # per node. + min_nics_per_node = 99999 + host_ids = [fe_host_id] + ip_addrs = [fe_ip_addr] + hostnames = [] + + for index, node in net_conf.items(): + if index != 'f' and node.state is NodeDescriptor.State.ACTIVE: + min_nics_per_node = min(min_nics_per_node, len(net_conf[index].ip_addrs)) + host_ids.append(str(net_conf[index].host_id)) + ip_addrs.append(net_conf[index].ip_addrs[0]) + hostnames.append(net_conf[index].host_name) + + return host_ids, ip_addrs, hostnames, min_nics_per_node + + def run_startup(self, + net_conf: Optional[dict] = None): """Complete bring up of runtime services + + + :param net_conf: Net config for overlay network. Used if resiliency has been requested at initializationq, defaults to None + :type net_conf: dict, optional """ log = logging.getLogger(dls.LA_FE).getChild('run_startup') - # This is set here for the overlay network. 
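(run_startup continues below.) Stepping back to _define_node_pools above: on every (re)start it promotes enough non-DOWN nodes to ACTIVE to meet the request, demotes the remainder to IDLE, and guarantees exactly one ACTIVE node carries the primary flag. A stripped-down sketch of that relabelling pass, where a small dataclass stands in for NodeDescriptor and the reduced-node warning path is omitted:

    from dataclasses import dataclass

    @dataclass
    class Node:                     # illustrative stand-in for NodeDescriptor
        state: str                  # 'ACTIVE' | 'IDLE' | 'DOWN'
        is_primary: bool = False

    def relabel(net_conf: dict, nnodes: int) -> dict:
        backend = {idx: n for idx, n in net_conf.items() if idx != 'f'}
        active = 0
        for node in backend.values():
            if node.state == 'DOWN':
                continue
            if active < nnodes:
                node.state = 'ACTIVE'
                active += 1
            else:
                node.state = 'IDLE'
        if active == 0:
            raise RuntimeError('No available backend hardware resources to use')
        # Guarantee exactly one ACTIVE backend node carries the primary flag.
        primaries = [n for n in backend.values() if n.state == 'ACTIVE' and n.is_primary]
        if len(primaries) != 1:
            for n in backend.values():
                n.is_primary = False
            next(n for n in backend.values() if n.state == 'ACTIVE').is_primary = True
        return net_conf

    conf = {'f': Node('ACTIVE'), '0': Node('DOWN'), '1': Node('IDLE'), '2': Node('IDLE')}
    relabel(conf, nnodes=2)
    assert conf['1'].state == conf['2'].state == 'ACTIVE'
    assert conf['1'].is_primary and not conf['2'].is_primary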
this_process.set_num_gateways_per_node(dfacts.DRAGON_OVERLAY_DEFAULT_NUM_GW_CHANNELS_PER_NODE) @@ -740,8 +912,16 @@ def run_startup(self): self._STATE = FrontendState.NET_CONFIG try: - self._orig_sigint = signal.signal(signal.SIGINT, self.sigint_handler) + # Catch ctrl+c events + self._orig_sigint = signal.signal(signal.SIGINT, self._sigint_handler) + + # Catch WLM killing off the backend + self._orig_sighup = signal.signal(signal.SIGHUP, self._sighup_handler) + + self._orig_sigterm = signal.signal(signal.SIGTERM, self._sigterm_handler) + log.debug("got signal handling in place") + except ValueError: # this error is thrown if we are running inside a child thread # which we do for unit tests. So pass on this @@ -750,37 +930,56 @@ def run_startup(self): self.la_fe_stdin = dlutil.OverlayNetLaFEQueue() self.la_fe_stdout = dlutil.LaOverlayNetFEQueue() - # Get the node config for the backend - log.debug('Getting the node config for the backend.') - if self._config_from_file is not None: - try: - log.info(f"Acquiring network config from file {self._config_from_file}") - self.net = NetworkConfig.from_file(self._config_from_file) - except Exception: - raise RuntimeError("Unable to acquire backend network configuration from input file.") + # If we have the config via an earlier frontend, don't do it all over again + if net_conf is None: + # Get the node config for the backend + log.debug('Getting the node config for the backend.') + + if self._config_from_file is not None: + try: + log.info(f"Acquiring network config from file {self._config_from_file}") + self.net = NetworkConfig.from_file(self._config_from_file) + except Exception: + raise RuntimeError("Unable to acquire backend network configuration from input file.") + else: + try: + log.info("Acquiring network config via WLM queries") + # This sigint trigger is -2 and -1 cases + self.net = NetworkConfig.from_wlm(workload_manager=self._wlm, + port=dfacts.DEFAULT_OVERLAY_NETWORK_PORT, + network_prefix=dfacts.DEFAULT_TRANSPORT_NETIF, + hostlist=self.hostlist, + sigint_trigger=self._sigint_trigger) + except Exception: + raise RuntimeError("Unable to acquire backend network configuration via workload manager") + + self.net_conf = self.net.get_network_config() else: - try: - log.info("Acquiring network config via WLM queries") - # This sigint trigger is -2 and -1 cases - self.net = NetworkConfig.from_wlm(workload_manager=self._wlm, - port=dfacts.DEFAULT_OVERLAY_NETWORK_PORT, - network_prefix=dfacts.DEFAULT_TRANSPORT_NETIF, - hostlist=self.hostlist, - sigint_trigger=self._sigint_trigger) - except Exception: - raise RuntimeError("Unable to acquire backend network configuration via workload manager") - net_conf = self.net.get_network_config() - log.debug(f"net_conf = {net_conf}") + self.net_conf = net_conf if self._sigint_trigger == 0: signal.raise_signal(signal.SIGINT) # Add the frontend config - net_conf['f'] = NodeDescriptor.get_local_node_network_conf(network_prefix=self.network_prefix, - port_range=dfacts.DEFAULT_FRONTEND_PORT) - fe_host_id = str(net_conf['f'].host_id) - fe_ip_addr = net_conf['f'].ip_addrs[0] # it includes the port - log.debug(f'node config: {net_conf}') + self.net_conf['f'] = NodeDescriptor.get_local_node_network_conf(network_prefix=self.network_prefix, + port_range=dfacts.DEFAULT_FRONTEND_PORT) + fe_host_id = str(self.net_conf['f'].host_id) + fe_ip_addr = self.net_conf['f'].ip_addrs[0] # it includes the port + log.debug(f'network config: {self.net_conf}') + + # this will raise an OSError when the frontend is run on a compute node 
w/o external access + try: + fe_ext_ip_addr = get_external_ip_addr().split(':')[0] + except OSError: + fe_ext_ip_addr = None + + # this will exist even w/o external access + head_node_ip_addr = self.net_conf['0'].ip_addrs[0].split(':')[0] + os.environ['DRAGON_HEAD_NODE_IP_ADDR'] = head_node_ip_addr + + if fe_ext_ip_addr is not None: + os.environ['DRAGON_FE_EXTERNAL_IP_ADDR'] = fe_ext_ip_addr + os.environ['DRAGON_RT_UID'] = str(rt_uid_from_ip_addrs(fe_ext_ip_addr, head_node_ip_addr)) # Create my memory pool conn_options = ConnectionOptions(min_block_size=2 ** 16) @@ -838,40 +1037,17 @@ def run_startup(self): os.environ[dfacts.GW_ENV_PREFIX + str(dfacts.DRAGON_OVERLAY_DEFAULT_NUM_GW_CHANNELS_PER_NODE)] = encoded_ser_gw_str register_gateways_from_env() - # start my transport agent - nnodes = len(net_conf) - 1 # Exclude the frontend node from this - log.debug(f'requested {self.nnodes} and got {nnodes}') - if self.nnodes > 0: - if self.nnodes > nnodes: - log.exception('too many nodes requested') - raise ValueError('Not enough backend nodes allocated to match requested') - nnodes = self.nnodes - else: - self.nnodes = nnodes - - log.debug(f'main has {nnodes} nodes') + # If we have any mods we need to make to the net_conf, do it now + self.net_conf = self._define_node_pools(self.net_conf) # Acquire the primary node and add # the frontend info to host_ids and ip_addrs, so the TA functions - host_ids = [str(net_conf['0'].host_id)] + [fe_host_id] - ip_addrs = [net_conf['0'].ip_addrs[0]] + [fe_ip_addr] - hostnames = [net_conf['0'].name] + host_ids, ip_addrs, hostnames, min_nics_per_node = self._define_overlay_network(self.net_conf, + fe_host_id, + fe_ip_addr) - # Add as many as needed to meet the requested node count. - # We also find the minimum number of network interface cards - # per node. 
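(The removed inline loop continues below.) Its replacement, the _define_overlay_network helper shown earlier, performs the same extraction but only over nodes marked ACTIVE. A toy run of that extraction, with plain dicts standing in for NodeDescriptor entries and made-up addresses:

    def overlay_inputs(net_conf, fe_host_id, fe_ip_addr):
        # Gather host IDs, IPs and hostnames for the frontend plus every ACTIVE
        # backend node, tracking the minimum NIC count seen per node.
        min_nics = 99999
        host_ids, ip_addrs, hostnames = [fe_host_id], [fe_ip_addr], []
        for idx, node in net_conf.items():
            if idx == 'f' or node['state'] != 'ACTIVE':
                continue
            min_nics = min(min_nics, len(node['ip_addrs']))
            host_ids.append(str(node['host_id']))
            ip_addrs.append(node['ip_addrs'][0])
            hostnames.append(node['host_name'])
        return host_ids, ip_addrs, hostnames, min_nics

    conf = {'0': {'state': 'ACTIVE', 'host_id': 11, 'ip_addrs': ['10.0.0.1:6565'], 'host_name': 'nid00001'},
            '1': {'state': 'DOWN',   'host_id': 12, 'ip_addrs': ['10.0.0.2:6565'], 'host_name': 'nid00002'}}
    print(overlay_inputs(conf, fe_host_id='99', fe_ip_addr='10.0.0.99:6565'))
    # -> (['99', '11'], ['10.0.0.99:6565', '10.0.0.1:6565'], ['nid00001'], 1)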
- min_nics_per_node = 99999 - for node_index in range(nnodes): - sindex = str(node_index) - min_nics_per_node = min(min_nics_per_node, len(net_conf[sindex].ip_addrs)) - if not net_conf[sindex].is_primary: - host_ids.append(str(net_conf[sindex].host_id)) - ip_addrs.append(net_conf[sindex].ip_addrs[0]) - hostnames.append(net_conf[sindex].host_name) - - log.debug(f"ip_addrs={ip_addrs}, host_ids={host_ids}") + log.debug(f"ip_addrs={ip_addrs}, host_ids={host_ids}, hostnames={hostnames}") log.debug(f'Found {min_nics_per_node} NICs per node.') - log.debug(f'standing up tcp agent with gw: {encoded_ser_gw_str}') self._STATE = FrontendState.OVERLAY_STARTING @@ -932,7 +1108,7 @@ def run_startup(self): self._STATE = FrontendState.STARTUP try: - self.wlm_proc = self._launch_backend(nnodes=nnodes, + self.wlm_proc = self._launch_backend(nnodes=self.nnodes, nodelist=hostnames, fe_ip_addr=fe_ip_addr, fe_host_id=fe_host_id, @@ -950,14 +1126,14 @@ def run_startup(self): signal.raise_signal(signal.SIGINT) # Receive BEIsUp msg - Try getting a backend channel descriptor - be_ups = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(nnodes)] + be_ups = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(self.nnodes)] assert len(be_ups) == self.nnodes # Construct the number of backend connections based on # the hierarchical bcast info and send FENodeIdxBE to those # nodes - log.info(f'received {nnodes} BEIsUp msgs') - self.conn_outs = self.construct_bcast_tree(net_conf, + log.info(f'received {self.nnodes} BEIsUp msgs') + self.conn_outs = self.construct_bcast_tree(self.net_conf, conn_policy, be_ups, encoded_inbound_str) @@ -966,12 +1142,12 @@ def run_startup(self): chs_up = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(self.nnodes)] for ch_up in chs_up: assert isinstance(ch_up, dmsg.SHChannelsUp), 'la_fe received invalid channel up' - log.info(f'received {nnodes} SHChannelsUP msgs') + log.info(f'received {self.nnodes} SHChannelsUP msgs') nodes_desc = {ch_up.idx: ch_up.node_desc for ch_up in chs_up} gs_cds = [ch_up.gs_cd for ch_up in chs_up if ch_up.gs_cd is not None] if len(gs_cds) == 0: - print('The Global Services CD was not returned by any of the SHChannelsUp messages. Launcher Exiting.') + print('The Global Services CD was not returned by any of the SHChannelsUp messages. Launcher Exiting.', flush=True) sys.exit(LAUNCHER_FAIL_EXIT) gs_cd = gs_cds[0] @@ -1014,10 +1190,10 @@ def run_startup(self): self.la_fe_stdout.send("A", la_ch_info.serialize()) log.info('sent LACHannelsInfo to overlaynet fe') - self.tas_up = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(nnodes)] + self.tas_up = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(self.nnodes)] for ta_up in self.tas_up: assert isinstance(ta_up, dmsg.TAUp), 'la_fe received invalid channel up' - log.info(f'received {nnodes} TAUp messages') + log.info(f'received {self.nnodes} TAUp messages') if self._sigint_trigger == 6: signal.raise_signal(signal.SIGINT) @@ -1031,6 +1207,8 @@ def run_startup(self): # Infrastructure is up self._STATE = FrontendState.STOOD_UP + return self.net_conf + def run_app(self): """Start user app execution via GSProcessCreate or SHProcessCreate """ @@ -1058,6 +1236,8 @@ def run_app(self): signal.raise_signal(signal.SIGINT) log.info('transmitted GSProcessCreate') + return self.net_conf + def run_msg_server(self): """Process messages from backend after user app starts """ @@ -1113,6 +1293,8 @@ def run_msg_server(self): # Set state so our exit method correctly executes. 
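(run_msg_server finishes below and, like run_startup and run_app, now returns self.net_conf.) That return value is what lets the resilient driver in dragon_multi_fe.py carry the discovered configuration, including any nodes marked DOWN, into the next launcher instance without re-querying the WLM. A shape-only sketch of that loop using a hypothetical stand-in class, not the real LauncherFrontEnd:

    class FakeFrontEnd:
        """Stand-in with the same context-manager and run_* surface as the frontend."""
        def __init__(self, net_conf=None):
            self.net_conf = net_conf or {'0': 'ACTIVE', '1': 'ACTIVE'}
        def __enter__(self):
            return self
        def __exit__(self, *exc):
            return False
        def run_startup(self, net_conf=None):
            return self.net_conf
        def run_app(self):
            return self.net_conf
        def run_msg_server(self):
            return self.net_conf

    net_conf, done = None, False
    while not done:
        try:
            with FakeFrontEnd(net_conf) as fe:
                net_conf = fe.run_startup(net_conf=net_conf)
                net_conf = fe.run_app()
                net_conf = fe.run_msg_server()
        except Exception:
            continue   # the real loop decides here whether enough nodes remain to retry
        else:
            done = True
    print(net_conf)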
self._STATE = FrontendState.LAUNCHER_DOWN + return self.net_conf + def probe_teardown(self): """Check on whether to begin teardown based on received backend messages""" # Global services is up and we're using it @@ -1143,7 +1325,7 @@ def handle_sh_fwd_output(self, msg: dmsg.SHFwdOutput): msg_out = self.build_stdmsg(msg, self.args_map, msg.fd_num == dmsg.SHFwdOutput.FDNum.STDOUT.value) self.msg_log.debug(f'{msg}') - print(msg_out, end="") + print(msg_out, end="", flush=True) @route(dmsg.LAExit, _DTBL) def handle_la_exit(self, msg: dmsg.LAExit): @@ -1280,18 +1462,18 @@ def send_msg_to_overlaynet(self, target, msg): Args: target (str): "A" for all backend nodes. "P" for primary node only - msg (dmsg._MsgBase): non-serialized message to send + msg (dmsg.InfraMsg): non-serialized message to send """ try: self.la_fe_stdout.send(target, msg.serialize()) except Exception: raise RuntimeError("Unable to send message to overlaynet send thread") - def _overlay_bcast(self, msg: dmsg._MsgBase): + def _overlay_bcast(self, msg: dmsg.InfraMsg): '''Send bcast of message to all backend nodes via overlay network :param msg: Message to send - :type msg: dmsg._MsgBase + :type msg: dmsg.InfraMsg ''' for conn_out in self.conn_outs.values(): @@ -1360,11 +1542,17 @@ def recv_msgs_from_overlaynet(self, # the get_with_timeout is a decorated function (i.e. wrapped function # in a function) that filters out all the log messages coming up through # overlaynet. So we don't worry about them here. See dlutil for details. - try: - be_msg = dlutil.get_with_blocking(self.conn_in) - except AbnormalTerminationError: + be_msg = dlutil.get_with_blocking_frontend_server(self.conn_in) + + if isinstance(be_msg, dmsg.AbnormalTermination): # I need to get this to the main thread without throwing an exception self._abnormal_termination.set() + + # Update the network configuration to reflect the down state of whatever node + # we got this error from + log.debug(f'Abort found from backend node: {be_msg.host_id}') + self._set_node_to_down_state(be_msg.host_id) + if self._STATE < FrontendState.TEARDOWN: be_msg = dmsg.ExceptionlessAbort(tag=dlutil.next_tag()) else: diff --git a/src/dragon/launcher/launchargs.py b/src/dragon/launcher/launchargs.py index b73fd0c..e3c2803 100644 --- a/src/dragon/launcher/launchargs.py +++ b/src/dragon/launcher/launchargs.py @@ -97,6 +97,19 @@ backend node-to-node communication. By default, the TCP transport agent is selected. Currently supported agents are: {', '.join([ta.value for ta in TransportAgentOptions])}''' +RESILIENT_HELP = '''If used, the Dragon runtime will attempt to continue execution of the +user app in the event of a hardware or user software error by falling back to functional +hardware resources and omitting hardware where the given error occurred.''' + +IDLE_HELP = '''In conjuction with the --resilient flag, the specifies the number of nodes +that will be held in reserve when the user application is run. In the event a node executing +the user application experiences an error, the Dragon runtime will pull an "idle" node into the +compute pool and begin executing the user application on it.''' + +EXHAUST_HELP = '''When used with --resilient execution, the Dragon runtime will continue executing +the user application in the event of any number of localized hardware errors until there are 0 +nodes available for computation. 
If not used, the default behavior of executing until the number +of nodes available is less than those requested via the --nodes argument''' class SplitArgsAtComma(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): @@ -158,8 +171,6 @@ def get_parser(): parser = argparse.ArgumentParser(prog='dragon', description='Dragon Launcher Arguments and Options') - parser.add_argument('-N', '--nodes', metavar='NODE_COUNT', dest='node_count', - type=non_negative_int, help=NODES_HELP) host_group = parser.add_mutually_exclusive_group() host_group.add_argument('--hostlist', action=SplitArgsAtComma, metavar='HOSTLIST', type=str, help=HOSTLIST_HELP) @@ -182,6 +193,15 @@ def get_parser(): parser.add_argument('-l', '--log-level', nargs=1, default=dict(), action=dlogutil.LoggingValue, dest='log_device_level_map', metavar='LOG_LEVEL', help=LOGGING_HELP) + + parser.add_argument('-r', '--resilient', action='store_true', help=RESILIENT_HELP) + parser.add_argument('-N', '--nodes', metavar='NODE_COUNT', dest='node_count', + type=non_negative_int, help=NODES_HELP) + parser.add_argument('-i', '--idle', metavar='IDLE_COUNT', dest='idle_count', + type=non_negative_int, help=IDLE_HELP) + parser.add_argument('-e', '--exhaust-resources', action='store_true', help=EXHAUST_HELP) + + parser.add_argument('--no-label', action='store_true', default=True) parser.add_argument('--basic-label', action='store_true') parser.add_argument('--verbose-label', action='store_true') diff --git a/src/dragon/launcher/network_config.py b/src/dragon/launcher/network_config.py index 33390a0..16a6e45 100644 --- a/src/dragon/launcher/network_config.py +++ b/src/dragon/launcher/network_config.py @@ -4,8 +4,6 @@ import json import enum import zlib -import signal -import sys from typing import Optional from base64 import b64encode, b64decode diff --git a/src/dragon/launcher/util.py b/src/dragon/launcher/util.py index cccb88a..62abe5e 100644 --- a/src/dragon/launcher/util.py +++ b/src/dragon/launcher/util.py @@ -134,6 +134,47 @@ def wrapper(*args, **kwargs): return wrapper +def no_error_queue_monitor(func: Callable, *, log_test_queue=None): + """Decorator to pull logging or abnormal messages out launch communications queues + + Take messages out of callback queue and puts into a separate logging queue to be dealt + with later, thereby allowing us to immediately handle infrastructure + + Args: + func (Callable): Callable that emits messages + + log_test_queue (queue, optional): Queue to drop log messages into. Used for internal + unit testing. Defaults to None. 
+ + Returns: + function: The decorator wrapper function + """ + + if func is None: + return partial(queue_monitor, log_test_queue=log_test_queue) + + @wraps(func) + def wrapper(*args, **kwargs): + while True: + msg = func(*args, **kwargs) + if isinstance(msg, dmsg.LoggingMsgList): + if log_test_queue: + log_test_queue.put(msg) + else: + for record in msg.records: + log = logging.getLogger(record.name) + log.log(record.level, record.msg, extra=record.get_logging_dict()) + elif isinstance(msg, dmsg.LoggingMsg): + if log_test_queue: + log_test_queue.put(msg) + else: + log = logging.getLogger(msg.name) + log.log(msg.level, msg.msg, extra=msg.get_logging_dict()) + else: + return msg + return wrapper + + @queue_monitor def get_with_timeout(handle, timeout=TIMEOUT_PATIENCE): """Function for getting messages from a queue given a timeout""" @@ -168,6 +209,19 @@ def get_with_blocking(handle): raise +@no_error_queue_monitor +def get_with_blocking_frontend_server(handle): + """Function for getting messages from the queue while blocking""" + try: + msg = handle.recv() + if isinstance(msg, tuple): + return msg + else: + return dmsg.parse(msg) + except Exception: + raise + + class LaOverlayNetFEQueue(queue.SimpleQueue): """Class for sending messages between launcher and OverlayNet threads""" diff --git a/src/dragon/launcher/wlm/ssh.py b/src/dragon/launcher/wlm/ssh.py index 052c1f8..ddbe9f4 100644 --- a/src/dragon/launcher/wlm/ssh.py +++ b/src/dragon/launcher/wlm/ssh.py @@ -249,6 +249,9 @@ def __init__(self, network_prefix, port, hostlist): def check_for_wlm_support(cls) -> bool: return shutil.which("ssh") + def _get_wlm_job_id(self) -> str: + raise RuntimeError('SSHNetworkConfig does not implement _get_wlm_job_id') + def _supports_net_conf_cache(self) -> bool: return False diff --git a/src/dragon/localservices/local_svc.py b/src/dragon/localservices/local_svc.py index 2b19e37..5cabe6c 100644 --- a/src/dragon/localservices/local_svc.py +++ b/src/dragon/localservices/local_svc.py @@ -65,7 +65,8 @@ def maybe_start_gs(gs_args: Optional[list], gs_env: Optional[dict], hostname: st ProcessProps(p_uid=dfacts.GS_PUID, critical=True, r_c_uid=None, stdin_req=None, stdout_req=None, stderr_req=None, stdin_connector=None, stdout_connector=stdout_connector, - stderr_connector=stderr_connector, layout=None), + stderr_connector=stderr_connector, layout=None, + local_cuids=set()), gs_args, bufsize=0, stdin=subprocess.PIPE, @@ -196,9 +197,6 @@ def single(make_infrastructure_resources: bool = True, assert msg.node_idx == 0, 'single node' node_index = msg.node_idx - node_sdesc = NodeDescriptor.make_for_current_node(is_primary=True).sdesc - log.info(f"Node: {str(node_sdesc)}") - if make_infrastructure_resources: assert not any((gs_input, la_input, shep_input, ta_input)) start_pools, start_channels, shep_input, la_input, ta_input_descr, ta_input, gs_input = mk_inf_resources(node_index) @@ -219,7 +217,8 @@ def single(make_infrastructure_resources: bool = True, ls_node_desc = NodeDescriptor.get_localservices_node_conf(host_name='localhost', name='localhost', - ip_addrs=['127.0.0.1']) + ip_addrs=['127.0.0.1'], + is_primary=True) ch_up_msg = dmsg.SHChannelsUp(tag=get_new_tag(), node_desc=ls_node_desc, gs_cd=dparms.this_process.gs_cd) @@ -235,7 +234,7 @@ def single(make_infrastructure_resources: bool = True, LocalServer.clean_pools(start_pools, log) raise RuntimeError('startup fatal error') from rte - gs_input.send(dmsg.SHPingGS(tag=get_new_tag(), node_sdesc=node_sdesc).serialize()) + gs_input.send(dmsg.SHPingGS(tag=get_new_tag(), 
node_sdesc=ls_node_desc.sdesc).serialize()) server = LocalServer(channels=start_channels, pools=start_pools, hostname='localhost') @@ -300,8 +299,6 @@ def multinode(make_infrastructure_resources: bool = True, setup_BE_logging(service=dls.LS, logger_sdesc=logger_sdesc, fname=fname) log.debug(f'dragon logging initiated on pid={os.getpid()}') - node_sdesc = NodeDescriptor.make_for_current_node(ip_addrs=ip_addrs, name=hostname, is_primary=is_primary).sdesc - if make_infrastructure_resources: assert not any((gs_input, la_input, ls_input, ta_input)) start_pools, start_channels, ls_input, la_input, ta_input_descr, ta_input, gs_input = mk_inf_resources(node_index) @@ -329,7 +326,8 @@ def multinode(make_infrastructure_resources: bool = True, # Create a node descriptor for this node I'm running on ls_node_desc = NodeDescriptor.get_localservices_node_conf(host_name=hostname, name=hostname, - ip_addrs=ip_addrs) + ip_addrs=ip_addrs, + is_primary=is_primary) ch_up_msg = dmsg.SHChannelsUp(tag=get_new_tag(), node_desc=ls_node_desc, gs_cd=gs_cd, @@ -408,7 +406,8 @@ def multinode(make_infrastructure_resources: bool = True, stdin_req=None, stdout_req=None, stderr_req=None, - layout=None + layout=None, + local_cuids=set() ) except Exception as e: logging.getLogger(dls.LS).getChild('start_ta').fatal(f'transport agent launch failed on {node_index}') @@ -472,7 +471,7 @@ def multinode(make_infrastructure_resources: bool = True, log.info('ls received GSPingSH from gs - m10') # Send response to GS - gs_in_wh.send(dmsg.SHPingGS(tag=get_new_tag(), idx=node_index, node_sdesc=node_sdesc).serialize()) + gs_in_wh.send(dmsg.SHPingGS(tag=get_new_tag(), idx=node_index, node_sdesc=ls_node_desc.sdesc).serialize()) log.info('ls sent SHPingGS - m11') except (OSError, EOFError, json.JSONDecodeError, AssertionError, RuntimeError) as rte: log.fatal('startup failed') diff --git a/src/dragon/localservices/server.py b/src/dragon/localservices/server.py index ebc7229..5cbbb25 100644 --- a/src/dragon/localservices/server.py +++ b/src/dragon/localservices/server.py @@ -9,11 +9,11 @@ import json import collections import selectors -import signal -import socket from .. import channels as dch from .. import managed_memory as dmm +from .. import fli +from ..rc import DragonError from .. import pmod from .. 
import utils as dutils @@ -24,9 +24,11 @@ from ..infrastructure import connection as dconn from ..infrastructure import parameters as dp + from ..dlogging import util as dlog from ..dlogging.util import DragonLoggingServices as dls -from ..utils import B64 +from ..utils import B64, b64encode, b64decode +from typing import Optional _TAG = 0 _TAG_LOCK = threading.Lock() @@ -41,24 +43,30 @@ def get_new_tag(): ProcessProps = collections.namedtuple('ProcessProps', ['p_uid', 'critical', 'r_c_uid', 'stdin_req', 'stdout_req', 'stderr_req', 'stdin_connector', 'stdout_connector', 'stderr_connector', - 'layout']) + 'layout', 'local_cuids']) class PopenProps(subprocess.Popen): def __init__(self, props: ProcessProps, *args, **kwds): assert isinstance(props, ProcessProps) - super().__init__(*args, **kwds) self.props = props - + # if your kwds are going to have a non-None env then it needs to be edited before the process is initialized + if props.layout is not None: + if props.layout.gpu_core and props.layout.accelerator_env: + # gpu_core list must be turned into a string in the form "0,1,2" etc + if isinstance(kwds["env"], dict): + kwds["env"][props.layout.accelerator_env] = ",".join(str(core) for core in props.layout.gpu_core) + else: + # this might be unnecessary because the environment might always be a dict. + os.environ[props.layout.accelevator_env] = ",".join(str(core) for core in props.layout.gpu_core) + super().__init__(*args, **kwds) # Assuming this is basically a free call, default the afinity to "everything" just in case os.sched_setaffinity(self.pid, range(os.cpu_count())) if props.layout is not None: if props.layout.cpu_core: os.sched_setaffinity(self.pid, props.layout.cpu_core) - # gpu_core list must be turned into a string in the form "0,1,2" etc - if props.layout.gpu_core and props.layout.accelerator_env: - os.environ[props.layout.accelevator_env] = ",".join(str(core) for core in props.layout.gpu_core) + # XXX Affinity settings are only inherited by grandchild processes # XXX created after this point in time. Any grandchild processes @@ -227,6 +235,14 @@ def flush(self): str_data = io_data.decode() + # This is temporary and code to ignore warnings coming from capnproto. The + # capnproto library has been modified to not send the following warning. + # kj/filesystem-disk-unix.c++:1703: warning: PWD environment variable ... + # The warning does not show up normally but does in our build pipeline for + # now until the pycapnp project is updated. So we eliminate it here for now. + if 'kj/' in str_data and ': warning:' in str_data: + return False + if not is_stderr and self._puid == dfacts.GS_PUID: raise TerminationException(str_data) else: @@ -330,6 +346,47 @@ def mk_input_connection_over_channel(ch_desc): return dconn.Connection(inbound_initializer=the_channel, options=dconn.ConnectionOptions(min_block_size=512), policy=dp.POLICY_INFRASTRUCTURE) +class AvailableLocalCUIDS: + """ Internal only class that manages Process Local CUIDs + """ + + # We reserve dfacts.MAX_NODES cuids for the main channel for each local services. 
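(The allocator's constants and methods follow below.) A simplified, self-contained model of its behaviour may help: each node owns a fixed-size block of cuids, hands them out round-robin, skips any still in use, and reclaims them when processes exit. The numbers here are made-up stand-ins, not the dfacts constants:

    class ToyLocalCUIDs:
        def __init__(self, first_cuid, per_node):
            self._first, self._per_node = first_cuid, per_node
            self._next, self._active = first_cuid, set()

        @property
        def next(self):
            if len(self._active) == self._per_node:
                raise RuntimeError('Ran out of process-local cuids')
            while True:
                # Wrap within this node's block and skip cuids that are still active.
                cuid = (self._next - self._first) % self._per_node + self._first
                self._next += 1
                if cuid not in self._active:
                    self._active.add(cuid)
                    return cuid

        def reclaim(self, cuids):
            self._active.difference_update(cuids)

    alloc = ToyLocalCUIDs(first_cuid=1010, per_node=4)
    a, b = alloc.next, alloc.next            # 1010, 1011
    alloc.reclaim({a})
    print([alloc.next for _ in range(3)])    # [1012, 1013, 1010] -- wraps and reuses 1010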
+ LOCAL_CUID_RANGE = (dfacts.BASE_SHEP_CUID + dfacts.RANGE_SHEP_CUID) - dfacts.MAX_NODES + AVAILABLE_PER_NODE = LOCAL_CUID_RANGE // dfacts.MAX_NODES + def __init__(self, node_index): + if node_index >= dfacts.MAX_NODES: # 0 <= node_index < MAX_NODES + raise RuntimeError(f'A Local Services nodes has {node_index=} which is greater than max allowed.') + + self._node_index = node_index + self._active = set() + self._initial_cuid = dfacts.BASE_SHEP_CUID + dfacts.MAX_NODES + node_index * AvailableLocalCUIDS.AVAILABLE_PER_NODE + self._next_available_cuid = self._initial_cuid + self._last_available = self._initial_cuid + AvailableLocalCUIDS.AVAILABLE_PER_NODE - 1 + + @property + def next(self): + if len(self._active) == AvailableLocalCUIDS.AVAILABLE_PER_NODE: + raise RuntimeError(f'Ran out of Process Local CUIDs. Limit is {AvailableLocalCUIDS.AVAILABLE_PER_NODE}') + + found_free_cuid = False + while not found_free_cuid: + cuid = (self._next_available_cuid - self._initial_cuid) % AvailableLocalCUIDS.AVAILABLE_PER_NODE + self._initial_cuid + if not cuid in self._active: + found_free_cuid = True + self._next_available_cuid += 1 + self._active.add(cuid) + return cuid + + def reclaim(self, cuids): + self._active.difference_update(cuids) + +def send_fli_response(resp_msg, ser_resp_fli): + + resp_fli = fli.FLInterface.attach(b64decode(ser_resp_fli)) + sendh = resp_fli.sendh() + sendh.send_bytes(resp_msg.serialize()) + sendh.close() + resp_fli.detach() class LocalServer: """Handles shepherd messages in normal processing. @@ -355,6 +412,9 @@ def __init__(self, channels=None, pools=None, self.exited_channel_output_monitors = queue.SimpleQueue() self.hostname = hostname self.cuid_to_input_connector = {} + self.node_index = parms.this_process.index + self.local_cuids = AvailableLocalCUIDS(self.node_index) + self.def_muid = dfacts.default_pool_muid_from_index(self.node_index) if channels is None: self.channels = {} # key c_uid value channel @@ -365,6 +425,10 @@ def __init__(self, channels=None, pools=None, else: self.pools = pools + # This is the local services key/value store used for + # bootstrapping on-node code. + self.kvs = {} + self.apt = {} # active process table. 
key: pid, value PopenProps obj self.puid2pid = {} # key: p_uid, value pid self.apt_lock = threading.Lock() @@ -387,6 +451,13 @@ def _logging_ex_handler(self, args): log.error(f'from {thread.name}:\n{buf.getvalue().decode()}') self._abnormal_termination(f'from {thread.name}:\n{buf.getvalue().decode()}') + def make_local_channel(self): + cuid = self.local_cuids.next + def_pool = self.pools[self.def_muid] + ch = dch.Channel(mem_pool=def_pool, c_uid=cuid) + self.channels[cuid] = ch + return ch + def set_shutdown(self, msg): log = logging.getLogger('shutdown event') self.shutdown_sig.set() @@ -608,10 +679,10 @@ def main_loop(self, shep_rh): if msg_pre is None: continue - if isinstance(msg_pre, str): + if isinstance(msg_pre, str) or isinstance(msg_pre, bytearray): try: msg = dmsg.parse(msg_pre) - except (json.JSONDecodeError, KeyError, NotImplementedError, ValueError) as err: + except (json.JSONDecodeError, KeyError, NotImplementedError, ValueError, TypeError) as err: self._abnormal_termination(f'msg\n{msg_pre}\nfailed parse!\n{err!s}') continue else: @@ -715,7 +786,7 @@ def create_channel(self, msg: dmsg.SHChannelCreate) -> None: if error: log.warning(error) - resp_msg=fail(error) + resp_msg = fail(error) else: self.channels[msg.c_uid] = ch encoded_desc = B64.bytes_to_str(ch.serialize()) @@ -724,6 +795,30 @@ def create_channel(self, msg: dmsg.SHChannelCreate) -> None: return resp_msg + @dutil.route(dmsg.SHCreateProcessLocalChannel, _DTBL) + def create_process_local_channel(self, msg: dmsg.SHCreateProcessLocalChannel) -> None: + log = logging.getLogger('create local channel') + log.info("Received an SHCreateProcessLocalChannel") + + if not msg.puid in self.puid2pid: + resp_msg = dmsg.SHCreateProcessLocalChannelResponse(tag=get_new_tag(), ref=msg.tag, err=DragonError.INVALID_ARGUMENT, errInfo='Cannot create channel for non-existent local process on node.') + send_fli_response(resp_msg, msg.respFLI) + return + + try: + ch = self.make_local_channel() + self.apt[self.puid2pid[msg.puid]].props.local_cuids.add(ch.cuid) + except dch.ChannelError as cex: + error = f'{msg!r} failed: {cex!s}' + resp_msg = dmsg.SHCreateProcessLocalChannelResponse(tag=get_new_tag(), ref=msg.tag, err=DragonError.INVALID_OPERATION, errInfo=error) + send_fli_response(resp_msg, msg.respFLI) + return + + encoded_desc = b64encode(ch.serialize()) + resp_msg = dmsg.SHCreateProcessLocalChannelResponse(tag=get_new_tag(), ref=msg.tag, err=DragonError.SUCCESS, serChannel=encoded_desc) + log.info("Received and Created a channel via SHCreateProcessLocalChannel") + send_fli_response(resp_msg, msg.respFLI) + @dutil.route(dmsg.SHChannelDestroy, _DTBL) def destroy_channel(self, msg: dmsg.SHChannelDestroy) -> None: log = logging.getLogger('destroy channel') @@ -748,8 +843,25 @@ def destroy_channel(self, msg: dmsg.SHChannelDestroy) -> None: return resp_msg + @dutil.route(dmsg.SHMultiProcessCreate, _DTBL) + def create_group(self, msg: dmsg.SHMultiProcessCreate) -> None: + log = logging.getLogger('create_group') + success, fail = mk_response_pairs(dmsg.SHMultiProcessCreateResponse, msg.tag) + + responses = [] + failed = False + for process_create_msg in msg.procs: + response = self.create_process(process_create_msg, msg.pmi_group_info) + responses.append(response) + if response.err == dmsg.SHProcessCreateResponse.Errors.FAIL: + failed = True + + # always return success + resp_msg = success(responses=responses, failed=failed) + return resp_msg + @dutil.route(dmsg.SHProcessCreate, _DTBL) - def create_process(self, msg: dmsg.SHProcessCreate) -> 
None: + def create_process(self, msg: dmsg.SHProcessCreate, pmi_group_info: Optional[dmsg.PMIGroupInfo] = None) -> None: log = logging.getLogger('create process') success, fail = mk_response_pairs(dmsg.SHProcessCreateResponse, msg.tag) @@ -772,6 +884,12 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: the_env = dict(os.environ) the_env.update(req_env) + # Add in the local services return serialized channel descriptor. + shep_return_ch = self.make_local_channel() + shep_return_cd = b64encode(shep_return_ch.serialize()) + the_env[dfacts.env_name(dfacts.SHEP_RET_CD)] = shep_return_cd + + gs_ret_chan_resp = None stdin_conn = None stdin_resp = None stdout_conn = None @@ -781,6 +899,14 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: stdout_root = False stderr_root = False + if msg.gs_ret_chan_msg is not None: + gs_ret_chan_resp = self.create_channel(msg.gs_ret_chan_msg) + if gs_ret_chan_resp.err != dmsg.SHChannelCreateResponse.Errors.SUCCESS: + resp_msg = fail(f'Failed creating the GS ret channel for new process: {stdin_resp.err_info}') + return resp_msg + desc = gs_ret_chan_resp.desc + the_env[dfacts.ENV_GS_RET_CD] = desc + if msg.stdin_msg is not None: stdin_resp = self.create_channel(msg.stdin_msg) if stdin_resp.err != dmsg.SHChannelCreateResponse.Errors.SUCCESS: @@ -841,12 +967,13 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: if msg.stderr == subprocess.STDOUT: stderr = subprocess.STDOUT - if msg.pmi_info: - log.debug(f'{msg.pmi_info}') + if pmi_group_info and msg.pmi_info: + log.debug('pmi_group_info=%s', str(pmi_group_info)) + log.debug('pmi_process_info=%s', str(msg.pmi_info)) log.info(f'p_uid {msg.t_p_uid} looking up pmod launch cuid') pmod_launch_cuid = dfacts.pmod_launch_cuid_from_jobinfo( dutils.host_id(), - msg.pmi_info.job_id, + pmi_group_info.job_id, msg.pmi_info.lrank ) @@ -863,7 +990,7 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: except KeyError: pass - the_env['PMI_CONTROL_PORT'] = str(msg.pmi_info.control_port) + the_env['PMI_CONTROL_PORT'] = str(pmi_group_info.control_port) the_env['MPICH_OFI_CXI_PID_BASE'] = str(msg.pmi_info.pid_base) the_env['DL_PLUGIN_RESILIENCY'] = "1" the_env['LD_PRELOAD'] = 'libdragon.so' @@ -886,7 +1013,7 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: ProcessProps(p_uid=msg.t_p_uid, critical=False, r_c_uid=msg.r_c_uid, stdin_req=msg.stdin, stdout_req=msg.stdout, stderr_req=msg.stderr, stdin_connector=stdin_connector, stdout_connector=stdout_connector, - stderr_connector=stderr_connector, layout=msg.layout), + stderr_connector=stderr_connector, layout=msg.layout, local_cuids=set([shep_return_ch.cuid])), real_args, bufsize=0, stdin=subprocess.PIPE, @@ -913,11 +1040,11 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: pmod.PMOD( msg.pmi_info.ppn, msg.pmi_info.nid, - msg.pmi_info.nnodes, - msg.pmi_info.nranks, - msg.pmi_info.nidlist, - msg.pmi_info.hostlist, - msg.pmi_info.job_id + pmi_group_info.nnodes, + pmi_group_info.nranks, + pmi_group_info.nidlist, + pmi_group_info.hostlist, + pmi_group_info.job_id ).send_mpi_data(msg.pmi_info.lrank, pmod_launch_ch) log.info(f'p_uid {msg.t_p_uid} DONE: sending mpi data for {msg.pmi_info.lrank}') @@ -931,7 +1058,8 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: proc_stdin_send.send(msg.initial_stdin) log.info('The provided string was written to stdin of the process by local services.') - resp_msg = success(stdin_resp=stdin_resp, stdout_resp=stdout_resp, stderr_resp=stderr_resp) + resp_msg = 
success(stdin_resp=stdin_resp, stdout_resp=stdout_resp, stderr_resp=stderr_resp, + gs_ret_chan_resp=gs_ret_chan_resp) except (OSError, ValueError) as popen_err: error = f'{msg!r} encountered {popen_err}' log.warning(error) @@ -1011,6 +1139,30 @@ def fwd_input(self, msg: dmsg.SHFwdInput) -> None: return resp_msg + @dutil.route(dmsg.SHSetKV, _DTBL) + def handle_set_kv(self, msg: dmsg.SHSetKV) -> None: + log = logging.getLogger('set key-value') + if msg.value == '': + if msg.key in self.kvs: + del self.kvs[msg.key] + else: + self.kvs[msg.key] = msg.value + resp_msg = dmsg.SHSetKVResponse(tag=get_new_tag(), ref=msg.tag, err=DragonError.SUCCESS) + log.info("Received SHSetKV message and processed it.") + send_fli_response(resp_msg, msg.respFLI) + + @dutil.route(dmsg.SHGetKV, _DTBL) + def handle_get_kv(self, msg: dmsg.SHSetKV) -> None: + log = logging.getLogger('get key-value') + if not msg.key in self.kvs: + resp_msg = dmsg.SHGetKVResponse(tag=get_new_tag(), ref=msg.tag, value='', err=DragonError.NOT_FOUND) + else: + val = self.kvs[msg.key] + resp_msg = dmsg.SHGetKVResponse(tag=get_new_tag(), ref=msg.tag, value=val, err=DragonError.SUCCESS) + log.info("Received SHSetKV message and processed it.") + send_fli_response(resp_msg, msg.respFLI) + + @dutil.route(dmsg.AbnormalTermination, _DTBL) def handle_abnormal_term(self, msg: dmsg.AbnormalTermination) -> None: log = logging.getLogger('abnormal termination') @@ -1102,6 +1254,21 @@ def watch_death(self): self._send_response(target_uid=r_c_uid, msg=resp) log.info(f'transmit {repr(resp)} via _send_response') + # Delete process local channels and reclaim their cuids. + for cuid in proc.props.local_cuids: + try: + self.channels[cuid].destroy() + except Exception as ex: + log.info(f'Could not destroy process local channel on process exit. Error:{repr(ex)}') + + try: + del self.channels[cuid] + except Exception as ex: + log.info(f'Could not remove process local channel on process exit. Error:{repr(ex)}') + + self.local_cuids.reclaim(proc.props.local_cuids) + proc.props.local_cuids.clear() + # If we haven't received SHTeardown yet if proc.props.critical and not self.check_shutdown(): if proc.props.p_uid == dfacts.GS_PUID: diff --git a/src/dragon/managed_memory.pxd b/src/dragon/managed_memory.pxd index 7972988..0f0ba34 100644 --- a/src/dragon/managed_memory.pxd +++ b/src/dragon/managed_memory.pxd @@ -5,6 +5,7 @@ cdef class MemoryPool: cdef dragonMemoryPoolSerial_t _pool_ser cdef dragonMemoryPoolAttr_t _mattr cdef bint _serialized + cdef dragonM_UID_t _muid cdef inline dragonError_t get_pool_ptr(self, dragonMemoryPoolDescr_t * pool): diff --git a/src/dragon/mpbridge/context.py b/src/dragon/mpbridge/context.py index adade63..eb1d9b6 100644 --- a/src/dragon/mpbridge/context.py +++ b/src/dragon/mpbridge/context.py @@ -776,7 +776,6 @@ def Pool(self, processes=None, initializer=None, initargs=(), maxtasksperchild=N initializer=initializer, initargs=initargs, maxtasksperchild=maxtasksperchild, - context=self.get_context(), ) def RawValue(self, typecode_or_type, *args): diff --git a/src/dragon/mpbridge/pool.py b/src/dragon/mpbridge/pool.py index 1c5a3ed..f62c24c 100644 --- a/src/dragon/mpbridge/pool.py +++ b/src/dragon/mpbridge/pool.py @@ -1,10 +1,10 @@ """Dragon's replacement for Multiprocessing Pool. -By default this uses a patched version of the dragon native pool and sets -`DRAGON_BASEPOOL="NATIVE"`. The private api for this class is still under -development. 
To revert to the version based on the `multiprocessing.Pool` class -with a patched terminate_pool method, set `DRAGON_BASEPOOL="PATCHED"` in the -environment. +By default this uses the dragon native pool and sets +`DRAGON_BASEPOOL="NATIVE"`. The private api for this class is still under +development. To revert to the version based on the `multiprocessing.Pool` class +with a patched terminate_pool method, set `DRAGON_BASEPOOL="PATCHED"` in the +environment. """ import multiprocessing.pool from multiprocessing import get_start_method @@ -127,13 +127,14 @@ class DragonPoolPatched(multiprocessing.pool.Pool): # Dummy def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + class WrappedDragonProcess: # Dummy def __init__(self, process, ident): self._puid = ident if process is None: self._process = Process(None, ident=self._puid) - + def start(self) -> None: """Start the process represented by the underlying process object.""" self._process.start() @@ -144,7 +145,11 @@ def is_alive(self) -> bool: :return: True if the process is running, False otherwise :rtype: bool """ - return self._process.is_alive + try: + stat = self._process.is_alive + except Exception: + stat = False + return stat def join(self, timeout: float = None) -> int: """Wait for the process to finish. @@ -154,7 +159,7 @@ def join(self, timeout: float = None) -> int: :return: exit code of the process, None if timeout occurs :rtype: int :raises: ProcessError - """ + """ return self._process.join() def terminate(self) -> None: @@ -177,7 +182,7 @@ def kill(self) -> None: def pid(self): """Process puid. Globally unique""" return self._puid - + @property def name(self) -> str: """gets serialized descriptors name for the process @@ -186,7 +191,7 @@ def name(self) -> str: :rtype: str """ return self._process.name - + @property def exitcode(self) -> int: """When the process has terminated, return exit code. None otherwise.""" @@ -195,15 +200,15 @@ def exitcode(self) -> int: @property def sentinel(self): raise NotImplementedError - + @property def authkey(self): raise NotImplementedError - + @property def daemon(self): raise NotImplementedError - + @property def close(self): raise NotImplementedError @@ -264,7 +269,7 @@ class DragonPool(NativePool): def __init__(self, *args, context=None, **kwargs): super().__init__(*args, **kwargs) - + @property def _pool(self): puids = self._pg.puids @@ -275,23 +280,22 @@ def _pool(self): # add a wrapped proc that has an interface like what mp is expecting for puid in puids: pool_procs.append(WrappedDragonProcess(None, ident=puid)) - return pool_procs + return pool_procs def _repopulate_pool(self): - # repopulate pool by shutting PG down and then starting new PG + # repopulate pool by shutting PG down and then starting new PG if self._close_thread is not None: raise RuntimeError("Trying to repopulate a pool that was previously closed. 
This pattern is not supported.") if not self._pg.status == "Stop": self._pg.kill(signal.SIGTERM) self._pg.join() self._pg.stop() - + del self._pg self._pg = ProcessGroup(restart=True, ignore_error_on_exit=True) self._pg.add_process(self._processes, self._template) self._pg.init() self._pg.start() - def apply_async( self, func: callable, diff --git a/src/dragon/native/group_state.py b/src/dragon/native/group_state.py new file mode 100644 index 0000000..fc96982 --- /dev/null +++ b/src/dragon/native/group_state.py @@ -0,0 +1,1085 @@ +import logging +import time +import enum +import signal +import threading +from time import sleep +from typing import List, Tuple +from abc import ABC, abstractmethod + +from .value import Value +from .queue import Queue +from .array import Array +from .lock import Lock + +from ..globalservices.process import ( + multi_join, + get_create_message, + get_create_message_with_argdata, + query as process_query, + get_multi_join_failure_puids, + get_multi_join_success_puids +) +from ..globalservices.group import ( + create, + create_add_to, + remove_from, + destroy, + kill as group_kill, + GroupError) +from ..globalservices.policy_eval import PolicyEvaluator +from ..infrastructure.policy import Policy +from ..channels import Channel +from ..infrastructure.connection import Connection + +LOG = logging.getLogger(__name__) + +# exit code returned by cython for sigterm +# we also mod by 256 for unsigned char repr +CYTHON_SIGTERM_ECODE = -15 + + +class DragonProcessGroupError(Exception): + """Exceptions raised by the Dragon Pool Workers implementation""" + + pass + + +class DragonProcessGroupAbnormalExit(DragonProcessGroupError): + """Exception raised by the Dragon Pool Workers implementation""" + + pass + + +class DragonCriticalProcessFailure(Exception): + """Exception raised by the Dragon Pool Workers implementation""" + + pass + + +@enum.unique +class PGSignals(enum.IntEnum): + ERROR = enum.auto() # for testing purposes + CRITICAL_FAILURE = enum.auto() # for raising exceptions in case of failed processes + NEW = enum.auto() # Start manager but no workers + START = enum.auto() # start all processes/workers + JOIN = enum.auto() # Wait for all the processes to complete + JOIN_SAVE = enum.auto() # Wait for all the processes to complete and save puids at completion + SHUTDOWN = enum.auto() # stop all processes/workers via SIGTERM + SHUTDOWN_SAVE = enum.auto() # stop all processes/workers via SIGTERM and save puids at completion + KILL = enum.auto() # forcefully stop all processes/workers via SIGKILL + KILL_SAVE = enum.auto() # forcefully stop all processes/workers via SIGKILL and save puids at completion + STOP = enum.auto() # kill all Dragon processes and exit the manager + STOP_SAVE = enum.auto() # kill all Dragon processes and exit the manager, but cache inactive puids first + EXIT_LOOP = enum.auto() # tell state runner to exit its loop + RAISE_EXCEPTION = enum.auto() # tell state runner to return the group state status + GROUP_STARTED = enum.auto() # let processgroup loop know the group has been started in maintain + GROUP_KILLED = enum.auto() # let processgroup loop know the group has been destroyed from shutdown + RETURN_TO_IDLE = enum.auto() # let processgroup loop know all group procs are gone and state is back to Idle + REQ_PUIDS = enum.auto() # request state runner populate the puids array with active puids + REQ_PUIDS_RESPONSE = enum.auto() # let process group know the puids array is up-to-date + REQ_INACTIVE_PUIDS = enum.auto() # request state runner populate the 
inactive puids array with inactive puids + REQ_INACTIVE_PUIDS_RESPONSE = enum.auto() # let process group know the inactive puids array is up-to-date + + +class BaseState(ABC): + """This class declares methods that all concrete State classes should + implement and also provides a backreference to the Context object, + associated with the State. This backreference can be used by States to + transition the Context to another State. + It also defines common methods and data structures to all states. + """ + + forbidden: list[int] = None + gs_req_lock = threading.Lock() + + @property + def state(self): + """Link back to its our parent state object. + :return: The group state holding this state + :rtype: GroupContext + """ + + return self._state + + @state.setter + def state(self, new_state) -> None: + self._state = new_state + + @abstractmethod + def run(self, prior_state, signal: PGSignals, sig_id: int) -> None: + """Execute the run function defined by the parent state""" + pass + + def __str__(self): + return self.__class__.__name__ + + +# concrete states of the process group + + +class Error(BaseState): + """This is the fallback state if an issue with the group occurs. + The state of the processes is undefined here. + """ + + forbidden: list = [s for s in PGSignals if s not in [PGSignals.KILL, PGSignals.STOP]] + + def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: + + try: + LOG.error(f"Process Group {self.state} is in error state.") + except Exception: + pass + + +class CriticalFailure(BaseState): + """This state is triggered when an individual process marked as critical fails + In this state, we kill all processes in the Group and raise an exception + that gets triggered in the Monitor state, so the user sees an exception + raised in their application""" + + forbidden: list = [s for s in PGSignals if s not in [PGSignals.KILL, PGSignals.STOP]] + + def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: + + if self.state.guid: + group_kill(self.state.guid) + self.state.guid = self.state._group_descr = None + + +class Idle(BaseState): + """This state kills existing processes and does nothing otherwise.""" + + forbidden = [PGSignals.KILL] + + def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: + """The idle state just does nothing except making sure all processes are gone.""" + + # Before doing anything make sure any threads are joined on: + try: + prior_state._maintainer_quit.set() + except AttributeError: + pass + + try: + prior_state._maintainer_thread.join() + except (AttributeError, RuntimeError): + pass + + # Now clean up anything else + if self.state.guid: + try: + group_kill(self.state.guid) + except (AttributeError, RuntimeError): + pass + + try: + prior_state._multi_join_runner_thread.join() + except (AttributeError, RuntimeError): + pass + + # See if there are any hanging processes on our end + non_exit_puids = [puid for puid in self.state.local_puids if puid != 0] + if len(non_exit_puids) > 0: + + # Honestly, this should be uncessesary work for GS. 
We should figure out a better way + proc_statuses = [] + for puid in non_exit_puids: + + # check the status of this process by asking GS + not_exited = True + while not_exited: + proc_desc = process_query(puid) + if proc_desc.ecode: + not_exited = False + proc_statuses.append((puid, proc_desc.ecode)) + else: + # don't overwhelm GS with requests + sleep(0.01) + self.state._update_inactive_puids(proc_statuses) + + # Drop the global services state + self.state.guid = self.state._group_descr = None + + self._start_time = None + + +def complete_exit(puid_codes: List[Tuple[int, int]], + state: BaseState, + pgsignal: PGSignals, + sig_id: int, + conn_out: Connection): + """Register exiting of all processes in the ProcessGroup + + One of the last steps before ending execution of a ProcessGroup. It updates + puids, both active and inactive, zeros out the Group ID/descriptor, and + transitions the state to Idle + + :param puid_codes: List of the last tuples of puids and exit code as reported by Global Services + :type puid_codes: List[Tuple[int, int]] + :param state: The current state the ProcessGroup is in + :type state: BaseState + :param pgsignal: The signal we are told to update based on. From the user. + :type pgsignal: PGSignals + :param sig_id: signal that tells the user code we have completed the requested state + :type sig_id: int + :param conn_out: Connection to use to transmit a RETURN_TO_IDLE state + :type conn_out: Connection + """ + + # catch bad ecodes + ignore_err = state.ignore_error_on_exit + if (pgsignal == PGSignals.STOP): + ignore_err = True + + # Since this is running in its own thread, it needs to update the + # status if needed + raise_error = False + if not ignore_err: + for puid, ecode in puid_codes: + if ecode not in {0, CYTHON_SIGTERM_ECODE, CYTHON_SIGTERM_ECODE % 256}: + LOG.debug(f"Bad exit code {ecode} for puid {puid} in ProcessGroup") + error_to_raise = DragonProcessGroupAbnormalExit(f"Process {puid} exited abnormally with {ecode}") + raise_error = True + + # Update the exit code + state._update_inactive_puids(puid_codes) + + if state._save_puids: + state._update_puids_array(PGSignals.REQ_PUIDS) + state._update_puids_array(PGSignals.REQ_INACTIVE_PUIDS) + + # If we ended up here, we've joined on all members of the group and that particular descriptor + # and guid is now dead. Querying on it can deadlock gs_request. + state.guid = state._group_descr = None + + # raise exception to parent thread + if raise_error: + state.man_out.send((PGSignals.RAISE_EXCEPTION, error_to_raise)) + # Else move on to error and exit this state + else: + # Tell anyone waiting for we're done + conn_out.send((PGSignals.RETURN_TO_IDLE, sig_id)) + prior_state = state.transition_to(Idle) + state.run(prior_state, pgsignal, sig_id) + state.update_status(None) + + +class Maintain(BaseState): + """This state starts missing processes and restarts processes that are not + alive anymore. + :raises DragonProcessGroupError: If one of the processes could not be (re)started. 
+ """ + + forbidden = [PGSignals.NEW, PGSignals.START] + _maintainer_thread = None + _maintainer_quit = None + + def run(self, prior_state: BaseState, pgsignal: PGSignals, sig_id: int) -> None: + + nretries = self.state.num_restart_retries + + if self.state._group_descr is None: + self.state._start_group_once() + self.state.man_out.send((PGSignals.GROUP_STARTED, sig_id)) + self.state.update_status(sig_id) + + def _restart_processes(puids: List[int], + guid: int, + quitter_event: threading.Event) -> bool: + """Remove a list of puids from the GS ProcessGroup, restart a corresponding number, and define a new GroupDescriptor + + :param puids: List of puids to remove from Global Services' monitoring (because they've exited) + :type puids: List[int] + :param guid: The group descriptor guid these processes belong to + :type guid: int + :param quitter_event: threading event that will tell parent thread to exit this function early. + :type quitter_event: threading.Event + :returns: Whether the calling fucnction should break from its while loop due to the quitter_event being set + :rtype: bool + """ + group_descr = None + break_loop = False + + for puid, _ in puids: + # Before submitting a GS request, check that we haven't been told to quit + if quitter_event.is_set(): + break_loop = True + break + try: + group_descr = remove_from(guid, [puid]) + # Group may have been destroyed and we just haven't gotten back to the top of our loop yet + except (GroupError, TypeError): + if quitter_event.is_set(): + break_loop = True + break + + # create a new one + msg = self.state.messages[self.state.puid_to_message_map[puid]] + nrestarts = 0 + + while nrestarts < nretries: + try: + # Before submitting a GS request, check that we haven't been told to quit + if quitter_event.is_set(): + break_loop = True + break + group_descr = create_add_to(guid, [(1, msg.serialize())], self.state.policy) + # update the puid_to_message_map dict with the newly added process + puids = [descr.uid for lst in group_descr.sets for descr in lst] + for new_puid in puids: + if new_puid not in self.state.puid_to_message_map: + self.state.puid_to_message_map[new_puid] = self.state.puid_to_message_map[ + puid + ] + # since we added only one process, we can safely assume that we found it + break + break + except Exception: + nrestarts += 1 + + # we need to update the group descriptor after all the above additions/removals + if group_descr is not None: + guid = group_descr.g_uid + self.state._group_descr = group_descr + self.state.guid = group_descr.g_uid + self.state._update_active_puids([descr.uid for lst in group_descr.sets for descr in lst]) + if break_loop: + break + + if nrestarts == nretries: + self.state.man_out.send((PGSignals.RAISE_EXCEPTION, + DragonProcessGroupError(f"Unable to start process using message {msg}."))) + + return break_loop + + def _maintain_runner(sig_id: int, + quitter_event: threading.Event): + """Maintain thread function that monitors processes and restarts them as they exit until told to stop + + :param sig_id: Signal that tells used to tell us user has control of its main thread again + :type sig_id: int + :param quitter_event: Event telling us to exit and allow transition to a new state + :type quitter_event: threading.Event + """ + + break_loop = False + # Wait until we've updated the status to start our maintain loop + while self.state.last_completed_signal != sig_id: + sleep(0.01) + + # Enter a loop monitoring the processes in our group and restart if they go down + while not break_loop: + + # Get the latest 
guid just in case it's been changed + guid = self.state.guid + + # The event checking may seem excessive but we want to avoid entering a + # GS request via multi_join if at all possible. + if quitter_event.is_set(): + break + puids = [descr.uid for lst in self.state._group_descr.sets for descr in lst] + if puids: + ready = multi_join(puids, join_all=False, timeout=0.3, return_on_bad_exit=True) + # Make sure we weren't told to exit by the user. + if quitter_event.is_set(): + break + + if ready[0]: + self.state._update_inactive_puids(ready[0]) + + # Check if there were any non-zero exits + bad_exits, _ = get_multi_join_failure_puids(ready[1]) + clean_exits, _ = get_multi_join_success_puids(ready[1]) + + # If we returned from the join via a bad exit, restart those processes + if bad_exits: + break_loop = _restart_processes(bad_exits, guid, quitter_event) + # Otherwise, we can exit since we have no bad exits and all processes have exited and we (ready[0] condition) + elif clean_exits: + break_loop = _restart_processes(clean_exits, guid, quitter_event) + + # Start a thread that runs a loop over joins and restarts for our Pool implementation. + # Setting this loop in a thread allows us to return control to the thread listening for + # state changes from the manager + self._maintainer_quit = threading.Event() + self._maintainer_thread = threading.Thread(name="maintainer thread", + target=_maintain_runner, + args=(sig_id, self._maintainer_quit), + daemon=False) + self._maintainer_thread.start() + + +class Running(BaseState): + """State for running ProcessGroup with no restart, or to handle group kills from earlier states""" + + # user needs to wait for group to become IDLE + forbidden = [ + s for s in PGSignals if s not in [PGSignals.ERROR, + PGSignals.KILL, PGSignals.KILL_SAVE, + PGSignals.SHUTDOWN, PGSignals.SHUTDOWN_SAVE, + PGSignals.JOIN, PGSignals.JOIN_SAVE] + ] + + _multi_join_runner_thread = None + + def run(self, prior_state: BaseState, pgsignal: PGSignals, sig_id: int) -> None: + + # In case the maintainer thread is in action, make sure it's gone before we try doing + # anything else. This limits stress on global services and thread contention + try: + prior_state._maintainer_quit.set() + except AttributeError: + pass + + try: + prior_state._maintainer_thread.join() + except (AttributeError, RuntimeError): + pass + + # If we ended up here, the user may is requesting a shutdown rather than for us to run thing. + # I'm unsure why this is needed inside Running, but I'm following the precedent given me + if pgsignal == PGSignals.SHUTDOWN and self.state.guid is not None: # have processes exit + group_kill(self.state.guid, sig=signal.SIGTERM) + self.state.man_out.send((PGSignals.GROUP_KILLED, sig_id)) + + # NOTE: don't set guid to None here. Later transitions will do that when appropriate. + + # Make sure we join on everything + if not self._multi_join_runner_thread: + # this is more complicated as it needs to be, because we're using + # process objects and multi_join wants puids. + if prior_state == Idle: # if we started with restart == False from Idle + self.state._start_group_once() + self.state.man_out.send((PGSignals.GROUP_STARTED, sig_id)) + self.state.update_status(sig_id) + + # Set up a thread to run the blocking multi_join on + if not self._multi_join_runner_thread: + + def _multi_join_runner(puids: List[int], + critical: bool, + pgsignal: PGSignals, + sig_id: int): + """Function run by Running thread. 
Joins on a list of puids and exits when they've exited + + :param puids: List of puids to monitor state of + :type puids: List[int] + :param critical: Whether the loss of any invididual puid should be treated as critical via raising an error + :type critical: bool + :param pgsignal: The signal the user provided to bring us to this state + :type pgsignal: PGSignals + :param sig_id: The signal int that we were given upon transition to this state + :type sig_id: int + """ + + ready = multi_join(puids, join_all=True, timeout=None, return_on_bad_exit=critical) + if ready[0] is not None and not self.state.critical.value: # no timeout + complete_exit(ready[0], self.state, pgsignal, sig_id, self.state.man_out) + + # if we were told to treat each process in the group as a critical process, we want to + # give the user info about process-by-process proc status. + elif self.state.critical.value: + + # If multi-join told us everyone is done, let's get out of here: + if ready[0]: + complete_exit(ready[0], self.state, pgsignal, sig_id, self.state.man_out) + else: + bad_puids, _ = get_multi_join_failure_puids(ready[1]) + if bad_puids: + prior_state = self.state.transition_to(CriticalFailure) + self.state.run(prior_state, pgsignal, sig_id) + self.state.update_status(None) + + self._multi_join_runner_thread = threading.Thread(name="multi_join runner thread", + target=_multi_join_runner, + args=(self.state.local_puids, self.state.critical.value, pgsignal, sig_id), + daemon=False) + self._multi_join_runner_thread.start() + + # If we didn't go into the above section, we still need to transition to Idle. The + # logic in the above if block will handle the transition otherwise. + else: + self.state.man_out.send((PGSignals.RETURN_TO_IDLE, sig_id)) + prior_state = self.state.transition_to(Idle) + self.state.run(prior_state, pgsignal, sig_id) + self.state.update_status(None) + + +class Stop(BaseState): + """Stops all processes of the group and removes the group from the Manager. The + group cannot be restarted anymore. + """ + + forbidden = [s for s in PGSignals] # end of line + + def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: + + # Make sure we exit the maintain thread if it's up + try: + prior_state._maintainer_quit.set() + except AttributeError: + pass + + if self.state.guid is not None: + resp = destroy(self.state.guid) + ready = [(desc.desc.p_uid, desc.desc.ecode) for lst in resp.sets for desc in lst] + complete_exit(ready, self.state, signal, sig_id, self.state.man_out) + + if self.state._save_puids: + self.state._update_puids_array(PGSignals.REQ_PUIDS) + self.state._update_puids_array(PGSignals.REQ_INACTIVE_PUIDS) + + # And join on that thread + try: + prior_state._maintainer_thread.join() + except (AttributeError, RuntimeError): + pass + + # And join on that thread + try: + prior_state._maintainer_thread.join() + except (AttributeError, RuntimeError): + pass + + self.state.guid = self.state._group_descr = None + +# end concrete state classes + + +class ProcessGroupState: + """The Context defines the group interface for the manager and the client. + In particular, it handles signals and state changes. It maintains a + reference to an instance of a State subclass, which represents the current + state of the group of processes. + """ + + _state: BaseState = None + + update_interval_sec: float = 0.5 + num_kill_retries: int = 2 + num_restart_retries: int = 10000 + # why have more than 1 target status ? + # because JOIN transitions to Join, and then auto-transitions to Idle. 
+ # so the target state for the manager is Join, but the client has to + # wait for Signal completion on Join and Idle to not introduce a race condition. + # The same is true for SHUTDOWN, which is JOIN with a SIGTERM. + + target_states: dict = { + PGSignals.ERROR: [Stop], + PGSignals.NEW: [Idle], + PGSignals.START: [Maintain], + PGSignals.JOIN: [Running, Idle], + PGSignals.SHUTDOWN: [Running, Idle], + PGSignals.KILL: [Idle], + PGSignals.STOP: [Stop], + } + + forbidden: dict = { + str(Error()): Error.forbidden, + str(Idle()): Idle.forbidden, + str(Maintain()): Maintain.forbidden, + str(Running()): Running.forbidden, + str(Stop()): Stop.forbidden, + str(CriticalFailure()): CriticalFailure.forbidden + } + + avail_states: dict = { + str(Error()): 0, + str(Idle()): 1, + str(Maintain()): 2, + str(Running()): 3, + str(Stop()): 4, + str(CriticalFailure()): 5 + } + states_keys = list(avail_states.keys()) + states_vals = list(avail_states.values()) + + def __init__( + self, + templates: list[tuple], + nproc: int, + restart: bool, + ignore_error_on_exit: bool, + pmi_enabled: bool, # MPI jobs + walltime: float, + policy: Policy, + ) -> None: + """This class represents a group of processes and exposes an interface + to the Manager to handle signals and state transitions. + + :param templates: a list of tuples where each tuple contains a replication factor `n` and a Dragon ProcessTemplate object specifing the properties of the process to start. The processes can hold the same or different attributes in any way and will be numbered in order. + :type templates: list[tuple(int, dragon.native.process.ProcessTemplate),] + :param nproc: total number of processes that belong to the ProcessGroup + :type nproc: int + :param restart: wether to restart processes that exited prematurely, defaults to True + :type restart: bool + :param ignore_error_on_exit: whether to ignore errors when the group exists, defaults to False + :type ignore_error_on_exit: bool + :param pmi_enabled: wether to instruct Dragon to enable MPI support, defaults to False + :type pmi_enabled: bool + :param walltime: time in seconds to run processes before killing them. 
+ :type walltime: float + :param policy: determines the placement of the group resources + :type policy: dragon.infrastructure.policy.Policy + """ + + self.nproc = nproc + self.templates = templates # list of tuples + self.messages = {} # keys are the indices of tuples in self.templates + + # use a dict to make restarting easy and order-safe + self.puid_to_message_map = {} # keys are the puids, values are the keys in self.messages{} + + for i, tup in enumerate(templates): + t = tup[1] + if t.is_python: + self.messages[i] = get_create_message_with_argdata( + t.target, + t.cwd, + t.args, + t.env, + t.argdata, + pmi_required=pmi_enabled, + stdin=t.stdin, + stdout=t.stdout, + stderr=t.stderr, + policy=t.policy, + ) + else: + self.messages[i] = get_create_message( + t.target, + t.cwd, + t.args, + t.env, + pmi_required=pmi_enabled, + stdin=t.stdin, + stdout=t.stdout, + stderr=t.stderr, + policy=t.policy, + ) + + self.guid = None + self._group_descr = None + + self.restart = restart + self.ignore_error_on_exit = ignore_error_on_exit + self.pmi_enabled = pmi_enabled + self.walltime = walltime + if policy: + self.policy = policy + else: + self.policy = Policy() + + self._start_time = 0 + self._state = None + + # count signals so we can report completion + self._signal_counter = Value("i", value=0) + + # (State enum, join status, puid list, puid exited, exit code if exited, last_completed_signal) + self._status_lock = Lock() + self._status = (Value('i', value=-1), + Array('l', [-1] * self.nproc), + Value('i', value=-1)) + self._puids_initd = False + self.local_puids = [] + + # I need this to be able to grow to n > self.nproc, so I have to employ this hack + self._save_puids = False + self.local_inactives = [] + self._inactive_puids = Queue() + self._inactive_puids.put([(0, 0)]) + + self.critical = Value("b", value=False) + + self.transition_to(Idle) + self.update_status(None) + + def start_state_runner(self, + man_ch_in_sdesc: str, + man_ch_out_sdesc: str): + """Start the ProcessGroupState thread that listens for messages from Manager + + :param man_ch_sdesc_in: Channel for receiving messages from Manager + :type man_ch_sdesc_in: str + :param man_ch_sdesc_out: Channel for sending messages to Manager + :type man_ch_sdesc_out: str + """ + + self._state_runner_thread = threading.Thread(name="GroupContext State Runner Thread", + target=self._state_runner, + args=(man_ch_in_sdesc, man_ch_out_sdesc), + daemon=False) + self._state_runner_thread.start() + + def stop_state_runner(self): + """Join on the ProcessGroupState runner thread""" + + self._state_runner_thread.join() + + def handle_signal(self, signal: PGSignals) -> Tuple[BaseState, BaseState, PGSignals]: + """This method takes a signal, checks if the signal is allowed, given + the current state, and returns the new state. + + :param signal: The signal to consider + :type signal: PGSignals + :returns: target state, current state, modified signal + :rtype: {Tuple[BaseState, BaseState, PGSignals]} + """ + + # This messages is to tell us to PGSignals.STOP but to save the inactives before exiting. 
+ # So update that attribute and follow the logic for STOP + if signal in [PGSignals.STOP_SAVE, PGSignals.JOIN_SAVE, PGSignals.KILL_SAVE, PGSignals.SHUTDOWN_SAVE]: + self._save_puids = True + if signal == PGSignals.STOP_SAVE: + signal = PGSignals.STOP + elif signal == PGSignals.JOIN_SAVE: + signal = PGSignals.JOIN + elif signal == PGSignals.KILL_SAVE: + signal = PGSignals.KILL + elif signal == PGSignals.SHUTDOWN_SAVE: + signal = PGSignals.SHUTDOWN + + if signal in self._state.forbidden: + LOG.error(f"Signal not accepted {signal} for state {self._state} !") + return + + if signal: + next_state = self.target_states[signal][0] + else: + next_state = self.current_state + + return next_state, self.current_state, signal + + def transition_to(self, new_state: BaseState) -> BaseState: + """Transition the state to a new state. This does NOT run + the new state or updates the status. + + :param new_state: state to transition to + :type new_state: BaseState + :return: previous state + :rtype: BaseState + """ + + prior_state = self._state + + # If we're in maintain mode, we need tell the maintainer thread to stop + if self.status == str(Maintain()): + new_state._maintainer_quit = self._state._maintainer_quit + + # Proceed with the transition + self._state = new_state + self._state.state = self + + return prior_state + + @property + def current_state(self) -> BaseState: + """Return the current state object. + + :return: the state + :rtype: BaseState + """ + + return self._state + + @property + def prior_state(self) -> BaseState: + """Return the current state object. + + :return: the state + :rtype: BaseState + """ + + return self._prior_state + + def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int): + """Execute the current (!) state in self._state. 
+ + :param prior_state: prior state + :type prior_state: BaseState + """ + + self._state.run(self._state, prior_state, signal, sig_id) + + def transition_to_error(self, prior_state, sig_id, e): + LOG.error(f"Exception in transition {prior_state} -> {self._state}: {e} ") + self.transition_to(Error) + self.update_status(None) + self.run(self._state, None, sig_id) + + def _get_response(self, input_signal: PGSignals) -> PGSignals: + """Return a signal analogously matching the input signal + + :param input_signal: The signal we're requesting the corresponding signal to + :type input_signal: PGSignals + :returns: signal corresponding to the input + :rtype: {PGSignals} + """ + + if input_signal == PGSignals.REQ_PUIDS: + return PGSignals.REQ_PUIDS_RESPONSE + elif input_signal == PGSignals.REQ_INACTIVE_PUIDS: + return PGSignals.REQ_INACTIVE_PUIDS_RESPONSE + + def _state_runner(self, man_ch_sdesc_in: str, man_ch_sdesc_out: str): + """Function run by ProcessGroupState thread to respond to Manager requests + + :param man_ch_sdesc_in: Channel for receiving messages from Manager + :type man_ch_sdesc_in: str + :param man_ch_sdesc_out: Channel for sending messages to Manager + :type man_ch_sdesc_out: str + """ + + man_ch_in = Channel.attach(man_ch_sdesc_in) + man_ch_out = Channel.attach(man_ch_sdesc_out) + + man_inout = Connection(inbound_initializer=man_ch_in, + outbound_initializer=man_ch_out) + self.man_out = Connection(outbound_initializer=man_ch_in) + + running = True + while running: + + # Provide a way to handle a user requested timeout + if self.walltime and self._start_time > 0: + timeout = self.walltime - (time.monotonic() - self._start_time) + if man_inout.poll(timeout=timeout): + signal, sig_id = man_inout.recv() + else: + signal = PGSignals.KILL # goodbye + sig_id = self.last_completed_signal + + # Make sure we don't go crazy in a loop + self._start_time = 0 + # Otherwise, just do a better performing blocking recv. + else: + signal, sig_id = man_inout.recv() + + # The Manager may have gotten the message. If not, we need to forward it + if signal in [PGSignals.GROUP_STARTED, PGSignals.GROUP_KILLED, PGSignals.RETURN_TO_IDLE]: + man_inout.send((signal, sig_id)) + continue + + if signal in [PGSignals.REQ_PUIDS, PGSignals.REQ_INACTIVE_PUIDS]: + # Make sure the puids array is up-to-date with our local info + self._update_puids_array(signal) + man_inout.send((self._get_response(signal), sig_id)) + continue + + if signal in [PGSignals.STOP, PGSignals.STOP_SAVE]: + running = False + if signal == PGSignals.EXIT_LOOP: + break + # The thread in Running hit an exception + elif signal == PGSignals.RAISE_EXCEPTION: + self.transition_to_error(self.current_state, None, sig_id) + continue + + try: + new_state, prior_state, signal = self.handle_signal(signal) + self.transition_to(new_state) + self.run(prior_state, signal, sig_id) + self.update_status(sig_id) + except Exception as e: # automagically transition to Error if something goes wrong + self.transition_to_error(prior_state, sig_id, e) + break + + # Tell the manager we're out + man_inout.send((None, None)) + + def get_next_sig_id(self) -> int: + """Obtain the next unique signal ID for this group state + + :return: next signal id + :rtype: int + """ + + # we need to communicate if a signal completed, so we need an ID for it. + + sig_id = self._signal_counter.value + self._signal_counter.value = 1 + self._signal_counter.value + + return sig_id + + @property + def status(self) -> str: + """The current status of the Group. 
+ + :return: state name + :rtype: str + """ + with self._status_lock: + state_val, _, __ = self._status + val = state_val.value + + if val >= 0: + return self.states_keys[self.states_vals.index(val)] + else: + return str(None) + + @property + def puids(self) -> List[int]: + """The puids currently active and maintained by the GroupContext. + + :return: A list of puids. If puids haven't started, an empty list is returned + :rtype: List[int] + """ + with self._status_lock: + _, puids, __, = self._status + val = [puid for puid in puids if puid != 0] + return val + + @property + def active_puids(self) -> List[int]: + """The puids currently active and maintained by the GroupContext. + + :return: A list of puids if the process is started or None if not. + :rtype: List[int] + """ + + return self.puids + + @property + def inactive_puids(self) -> List[Tuple[int, int]]: + """puids that have exited processes + + :return: A list of puids if the process is started or None if not. + :rtype: list[int or None] + """ + + i_puids = self._inactive_puids.get(timeout=None) + self._inactive_puids.put(i_puids) + return i_puids + + @property + def last_completed_signal(self) -> int: + """Return the ID of the last successfully completed signal. + + :return: the signal id + :rtype: int + """ + with self._status_lock: + _, __, compl_sig_id = self._status + sig_id = compl_sig_id.value + + return sig_id + + def _update_puids_array(self, signal: PGSignals): + """Make sure the puids array matches in locally cached one, accessibly by ProcessGroup object + + :param signal: signal asking for Active or Inactive PUIDS + :type signal: PGSignals + """ + + if signal == PGSignals.REQ_PUIDS: + with self._status_lock: + _, puids, __, = self._status + for idx, local_puid in enumerate(self.local_puids): + puids[idx] = local_puid + elif signal == PGSignals.REQ_INACTIVE_PUIDS: + _ = self._inactive_puids.get(timeout=None) + self._inactive_puids.put(self.local_inactives) + + def _start_group_once(self): + """Start all processes in the ProcessGroup. Only done once""" + + if not self._start_time: + self._start_time = time.monotonic() + + # piecemeal merging of policies. + # first merge group policy and global policy with group policy > global policy. + # this merged policy is saved and is what will be used if in the maintain state if we need to restart processes + if self.policy is not None: + self.policy = PolicyEvaluator.merge(Policy.global_policy(), self.policy) + else: + self.policy = Policy.global_policy() + + # then go through process policies and merge process policies with group+global policy with process policy > group+global policy. + policy_list = [] + for i, tup in enumerate(self.templates): + # we check if we the template has a policy + if self.messages[i].policy is None: + policy_list.extend([self.policy] * tup[0]) + else: + # if the template process got a policy then we give it higher priority, than the policy of the process group. 
+ merged_policy = PolicyEvaluator.merge(self.policy, self.messages[i].policy) + policy_list.extend([merged_policy] * tup[0]) + + group_descr = create( + [(tup[0], self.messages[i].serialize()) for i, tup in enumerate(self.templates)], policy_list + ) + + self._group_descr = group_descr + self.guid = group_descr.g_uid + + # construct the puid_to_message_map dict now that the processes are created + puids = [descr.uid for lst in group_descr.sets for descr in lst] + puid_idx = 0 + for i, tup in enumerate(self.templates): + for _ in range(tup[0]): + self.puid_to_message_map[puids[puid_idx]] = i + puid_idx += 1 + + def _update_inactive_puids(self, exit_statuses: List[Tuple[int, int]]): + """Update the exit status of all processes inside of the state + + Takes results from GlobalServices' multi-join to update our internal state + :param exit_statuses: puids paired with their exit codes + :type exit_statuses: List[Tuple[int, int]] + """ + self.local_inactives += exit_statuses + + # Remove these dead puids from the active list + for l_puid, _ in exit_statuses: + try: + s_idx = [i for i, puid in enumerate(self.local_puids) if puid == l_puid] + # It's possible we already removed this puid earlier + if len(s_idx) == 1: + self.local_puids[s_idx[0]] = 0 + except (IndexError, AssertionError): + raise + + def _update_active_puids(self, puids: List[int]): + """Update the list of active puids + + Takes results from GlobalServices' multi-join to update our internal state + :param exit_statuses: puids paired with their exit codes + :type exit_statuses: List[Tuple[int, int]] + """ + + # find the current index: + for l_puid in puids: + if l_puid not in self.local_puids: + try: + idx = [idx for idx, o_puid in enumerate(self.local_puids) if o_puid == 0][0] + self.local_puids[idx] = l_puid + except Exception: + pass + + # I need to report the last completed signal ID here, so that the + # corresponding client can be be sure his request has been completed. + def update_status(self, compl_sig_id: int): + """update the global status of this state. + + :param compl_sig_id: signal just completed. If None, reuse the last value from the queue. + :type compl_sig_id: int + """ + with self._status_lock: + state_name, last_puids, last_sig_id = self._status + state_name.value = self.avail_states[str(self._state.__name__)] + + if compl_sig_id is None: # this transition was automatic + compl_sig_id = last_sig_id.value + else: + last_sig_id.value = compl_sig_id + + if self._group_descr and not self._puids_initd: + self.local_puids = [descr.uid for lst in self._group_descr.sets for descr in lst] + self._puids_initd = True + + def _make_critical(self): + + self.critical.value = True diff --git a/src/dragon/native/lock.py b/src/dragon/native/lock.py index c22b147..44a022a 100644 --- a/src/dragon/native/lock.py +++ b/src/dragon/native/lock.py @@ -147,7 +147,9 @@ def acquire(self, block: bool = True, timeout: float = None) -> bool: :rtype: bool """ - LOGGER.debug(f"Acquire Lock {self!r} with blocking={block}, timeout={timeout}") + # Locks are often acquired and released many times, we don't log these to prevent + # big impacts on performance. 
+ #LOGGER.debug(f"Acquire Lock {self!r} with blocking={block}, timeout={timeout}") if timeout is not None and timeout < 0: timeout = 0 @@ -184,7 +186,9 @@ def release(self) -> None: :raises AssertionError: if the lock is recursive and not held by the caller """ - LOGGER.debug(f"Release Lock {self!r}") + # Locks are often acquired and released many times, we don't log these to prevent + # big impacts on performance. + #LOGGER.debug(f"Release Lock {self!r}") if self._recursive: assert self._accesscount > 0, "attempt to release recursive lock not owned by process" diff --git a/src/dragon/native/machine.py b/src/dragon/native/machine.py index e8d8239..c039887 100644 --- a/src/dragon/native/machine.py +++ b/src/dragon/native/machine.py @@ -7,6 +7,7 @@ import logging from ..globalservices.node import query, query_total_cpus, get_list +from ..infrastructure.gpu_desc import AccVendor from ..infrastructure.parameters import this_process from ..utils import host_id @@ -73,6 +74,17 @@ def num_cpus(self) -> int: :rtype: int """ return self._descr.num_cpus + + @property + def num_gpus(self) -> int: + """Return a the number of GPUs on this node + + :return: The number of GPUs + :rtype: list[int] + """ + if self._descr.accelerators is None: + return 0 + return len(self._descr.accelerators.device_list) @property def physical_mem(self) -> int: @@ -82,6 +94,45 @@ def physical_mem(self) -> int: :rtype: int """ return self._descr.physical_mem + + @property + def gpus(self) -> list[int]: + """Return a list of GPU visible devices on this node + + :return: list of GPU visible devices + :rtype: list[int] + """ + if self._descr.accelerators is None: + return None + return self._descr.accelerators.device_list + + @property + def gpu_vendor(self) -> str: + """Return the name of the GPU Vendor on this node + + :return: GPU vendor name + :rtype: str + """ + if self._descr.accelerators is None: + return None + vendor_int = self._descr.accelerators.vendor + if vendor_int == AccVendor.NVIDIA: + return 'Nvidia' + elif vendor_int == AccVendor.AMD: + return 'AMD' + #TODO: Add Intel as a vendor + else: + return 'Unknown Vendor' + + @property + def cpus(self) -> list[int]: + """Return the CPUs available on this node + + :return: list of CPUs + :rtype: list[int] + """ + return self._descr.cpu_devices + @property def hostname(self) -> str: @@ -118,6 +169,6 @@ def __init__(self): def nodes(self): return self.nodes - + def nnodes(self) -> int: - return len(self.nodes) \ No newline at end of file + return len(self.nodes) diff --git a/src/dragon/native/pool.py b/src/dragon/native/pool.py index 068884c..e88098f 100644 --- a/src/dragon/native/pool.py +++ b/src/dragon/native/pool.py @@ -1,4 +1,4 @@ -"""The Dragon native pool manages a pool of child processes. +"""The Dragon native pool manages a pool of child processes. 
""" from __future__ import annotations @@ -16,8 +16,8 @@ import dragon from .queue import Queue -from .process import TemplateProcess -from .process_group import ProcessGroup +from .process import ProcessTemplate +from .process_group import ProcessGroup, DragonProcessGroupError from .event import Event from .process import current as current_process from .machine import System @@ -280,7 +280,7 @@ def __init__( raise ValueError("Number of processes must be at least 1") # starts a process group with nproc workers - self._template = TemplateProcess( + self._template = ProcessTemplate( self._worker_function, args=(self._inqueue, self._outqueue, self._initializer, self._initargs, self._maxtasksperchild), ) @@ -364,22 +364,36 @@ def join(self) -> None: if self._map_launch_thread is not None: self._map_launch_thread.join() - if not self._pg.status == "Stop": + if self._pg.status != "Stop": self._pg.join() - self._pg.stop() + try: + # This extra step makes sure the infrastructure related to managing the processgroup exits cleanly + self._pg.stop() + except DragonProcessGroupError: + pass self._results_handler.join() @staticmethod def _worker_function(inqueue, outqueue, initializer=None, initargs=(), maxtasks=None): - myp = current_process() # handles shutdown signal - termflag = False + class Value: + def __init__(self, val): + self._val = val + + @property + def val(self): + return self._val + + @val.setter + def val(self, val): + self._val = val + + termflag = Value(False) def handler(signum, frame): LOGGER.debug("_worker_function SIGTERM handler saw signal") - nonlocal termflag - termflag = True + termflag.val = True signal.signal(signal.SIGTERM, handler) @@ -391,7 +405,7 @@ def handler(signum, frame): completed_tasks = 0 - while not termflag and (maxtasks is None or (maxtasks and completed_tasks < maxtasks)): + while not termflag.val and (maxtasks is None or (maxtasks and completed_tasks < maxtasks)): # get work item LOGGER.debug(f"getting work from inqueue on node {socket.gethostname()}") try: @@ -404,8 +418,6 @@ def handler(signum, frame): time.sleep(POLL_FREQUENCY) continue - LOGGER.debug(f"{task} task received by {myp.ident} on {socket.gethostname()}") - job, i, func, args, kwargs = task LOGGER.debug(f"job={job}, i={i}, func={func}, args={args}, kwargs={kwargs}") @@ -427,8 +439,6 @@ def handler(signum, frame): outqueue.put((job, i, (False, wrapped))) completed_tasks += 1 - LOGGER.debug(f"{myp.ident} returning from worker_function") - @classmethod def _handle_results(cls, outqueue, cache, end_event): LOGGER.debug( diff --git a/src/dragon/native/process.py b/src/dragon/native/process.py index faeb222..2d020b0 100644 --- a/src/dragon/native/process.py +++ b/src/dragon/native/process.py @@ -1,5 +1,5 @@ """ The Dragon native process object provides process management across one or -multiple distributed systems. `TemplateProcess` can hold a blueprint for a process +multiple distributed systems. `ProcessTemplate` can hold a blueprint for a process that can be used to generate many similar processes. 
""" @@ -26,6 +26,8 @@ from ..infrastructure.facts import ARG_IMMEDIATE_LIMIT from ..infrastructure.process_desc import ProcessOptions from ..infrastructure.messages import PIPE as MSG_PIPE, STDOUT as MSG_STDOUT, DEVNULL as MSG_DEVNULL +from ..infrastructure.policy import Policy +from ..globalservices.policy_eval import PolicyEvaluator LOG = logging.getLogger(__file__) @@ -231,7 +233,7 @@ def returncode(self): raise NotImplementedError -class TemplateProcess: +class ProcessTemplate: """This class provides a template for a Dragon process.""" def __init__( @@ -244,7 +246,7 @@ def __init__( stdin: int = None, stdout: int = None, stderr: int = None, - # policy: Policy = DefaultPolicy, + policy: Policy = None, ): """Generic Dragon process template object defining a process based on a binary executable or a Python callable. @@ -260,7 +262,7 @@ def __init__( similar processes like this: ```Python - t = TemplateProcess(myfunc) + t = ProcessTemplate(myfunc) p1 = Process.from_template(t, ident="NewProcess1") p2 = Process.from_template(t, ident="NewProcess2") ``` @@ -291,12 +293,14 @@ def __init__( :type stdout: int, optional :param stderr: Stderr file handling. Valid values are PIPE, STDOUT and None. :type stderr: int, optional + :param policy: determines the placement and resources of the process + :type policy: dragon.infrastructure.policy.Policy """ self.is_python = callable(target) # store only the modified targets. We want to be able to pickle the - # TemplateProcess. The user has to use a method to get the original arguments. + # ProcessTemplate. The user has to use a method to get the original arguments. if self.is_python: self.target, self.args, self.argdata = self._get_python_process_parameters(target, args, kwargs) @@ -312,6 +316,19 @@ def __init__( self.stderr = stderr self.stdin = stdin + self.policy = policy + # can't grab the global policy here because of the following. + # default behavior will be that the global policy is grabbed when + # the process is started. Thus, template processes hold a policy + # that gets merged in later to the hierarchy of policies. + # with Policy(placement=x) + # with Policy(placement=y) + # local_policy = Policy(cpu_affinity=) + # temp_proc = ProcessTemplate(..., policy=local_policy) + # + # pg = ProcessGroup(policy = Policy(placement=z)) + # pg.add_procs(10, temp_proc) <- If I merge the policy into template policy, I don't know if placement=y is from the local policy or if placement=y is from global policy so I can't inject the group policy into the hierarchy correctly. + @staticmethod def _find_target(target, cwd) -> str: @@ -354,7 +371,7 @@ def start(self) -> None: raise NotImplementedError(f"You need to create a Process object from this template to start it.") -class Process(TemplateProcess): +class Process(ProcessTemplate): """This object describes a process managed by the Dragon runtime.""" def __new__(cls, *args, **kwargs): @@ -381,7 +398,7 @@ def __init__( stdin: int = None, stdout: int = None, stderr: int = None, - # policy: Policy = DefaultPolicy, + policy: Policy = None, ): """Generic Dragon process object executing a binary executable or a Python callable. @@ -421,13 +438,15 @@ def __init__( :type stdout: int, optional :param stderr: Stderr file handling. Valid values are PIPE, STDOUT and None. 
:type stderr: int, optional + :param policy: determines the placement and resources of the process + :type policy: dragon.infrastructure.policy.Policy """ self.started = False self.ident = ident self._pmi_enabled = _pmi_enabled - # strip the name/uid from the parameters, as TemplateProcess cannot have it by definition. + # strip the name/uid from the parameters, as ProcessTemplate cannot have it by definition. if ident: try: self._update_descriptor(ident=ident) @@ -438,16 +457,16 @@ def __init__( LOG.warning(f"Returning named process from infrastructure with valid target arg in call.") return # we're done - return super().__init__(target, args, kwargs, cwd, env, stdin, stdout, stderr) + return super().__init__(target, args, kwargs, cwd, env, stdin, stdout, stderr, policy) @classmethod def from_template( - cls, template: TemplateProcess, ident: str = None, _pmi_enabled: bool = False + cls, template: ProcessTemplate, ident: str = None, _pmi_enabled: bool = False ) -> object: """A classmethod that creates a new process object from a template. :param template: the template to base the process on - :type template: TemplateProcess + :type template: ProcessTemplate :param ident: intended name of the process, defaults to None :type ident: str, optional :return: The new process object @@ -470,8 +489,8 @@ def from_template( else: raise ProcessError(f"A process '{ident}' already exists within the Dragon runtime.") - return cls(target, args, kwargs, template.cwd, template.env, ident=ident, - _pmi_enabled=_pmi_enabled, stdin=template.stdin, stdout=template.stdout, stderr=template.stderr) + return cls(target, args, kwargs, template.cwd, template.env, ident=ident, + _pmi_enabled=_pmi_enabled, stdin=template.stdin, stdout=template.stdout, stderr=template.stderr, policy=template.policy) def start(self) -> None: """Start the process represented by the underlying process object.""" @@ -480,6 +499,11 @@ def start(self) -> None: raise RuntimeError(f"This Process has already been started with puid {self.puid}") options = ProcessOptions(make_inf_channels=True) + if self.policy is not None: + # merge global policy and processes' policy + self.policy = PolicyEvaluator.merge(Policy.global_policy(), self.policy) + else: + self.policy = Policy.global_policy() if self.is_python: @@ -491,6 +515,7 @@ def start(self) -> None: env=self.env, options=options, user_name=self.ident, + policy=self.policy, ) else: # binary @@ -503,7 +528,8 @@ def start(self) -> None: user_name=self.ident, stdin=self.stdin, stdout=self.stdout, - stderr=self.stderr + stderr=self.stderr, + policy=self.policy ) self.started = True diff --git a/src/dragon/native/process_group.py b/src/dragon/native/process_group.py index 2ba0af2..3d1edfa 100644 --- a/src/dragon/native/process_group.py +++ b/src/dragon/native/process_group.py @@ -4,7 +4,7 @@ it only maintains their lifecycle. This file implements a client API class and a Manager process handling all -groups on the node. The manager holds a list of GroupContext classes and gets +groups on the node. The manager holds a list of ProcessGroupState classes and gets signals from the client classes using a queue. The group of processes undergoes state transitions depending on the signals the client send to the manager. @@ -15,653 +15,305 @@ .. 
image:: ./images/dragon_worker_pool.svg :scale: 75% + """ import logging import time -import enum import signal -from queue import Empty - -from abc import ABC, abstractmethod - -import dragon - -from .process import Process, TemplateProcess -from .queue import Queue -from .value import Value -from .lock import Lock -from .machine import current as current_node - -from ..globalservices.process import ( - multi_join, - kill as process_kill, - get_create_message, - get_create_message_with_argdata, - query as process_query, +import threading +from time import sleep +from typing import List, Tuple + +from .process import Process, ProcessTemplate +from .group_state import ( + ProcessGroupState, + PGSignals, + DragonProcessGroupError, + Maintain, + Idle, + Error ) -from ..globalservices.group import create, kill as group_kill, add_to, create_add_to, remove_from, destroy -from ..infrastructure.policy import Policy -from ..infrastructure.policy import Policy, GS_DEFAULT_POLICY +from ..channels import Channel +from ..infrastructure.facts import default_pool_muid_from_index +from ..globalservices.process import kill as process_kill +from ..globalservices.channel import create +from ..infrastructure.parameters import this_process, Policy +from ..infrastructure.connection import Connection, ConnectionOptions +from ..utils import B64 LOG = logging.getLogger(__name__) -# exit code returned by cython for sigterm -# we also mod by 256 for unsigned char repr -CYTHON_SIGTERM_ECODE = -15 - - -class DragonProcessGroupError(Exception): - """Exceptions raised by the Dragon Pool Workers implementation""" - - pass - - -class DragonProcessGroupAbnormalExit(DragonProcessGroupError): - """Exception raised by the Dragon Pool Workers implementation""" - - pass - # TODO: Extend API to control distribution over multiple nodes. - -@enum.unique -class PGSignals(enum.IntEnum): - ERROR = enum.auto() # for testing purposes - NEW = enum.auto() # Start manager but no workers - START = enum.auto() # start all processes/workers - JOIN = enum.auto() # Wait for all the processes to complete - SHUTDOWN = enum.auto() # stop all processes/workers via SIGTERM - KILL = enum.auto() # forcefully stop all processes/workers via SIGKILL - STOP = enum.auto() # kill all Dragon processes and exit the manager - - -class BaseState(ABC): - """This class declares methods that all concrete State classes should - implement and also provides a backreference to the Context object, - associated with the State. This backreference can be used by States to - transition the Context to another State. - It also defines common methods and data structures to all states. - """ - - forbidden: list[int] = None - - @property - def context(self): - """Link back to it's own context. - - :return: The group context holding this state - :rtype: GroupContext +class Manager: + def __init__(self, + pg_ch_in_sdesc, + pg_ch_out_sdesc): + """The Manager class holds a process that handles the life-cycle of all + process groups on this node. + We handle the group using a ProcessGroupState class that holds all necessary + methods and attributes shared by manager and clients. 
""" - return self._context - - @context.setter - def context(self, new_context) -> None: - self._context = new_context - - @abstractmethod - def run(self, prior_state, signal: PGSignals, sig_id: int) -> None: - """This method runs the state it is a part of on the context.""" - pass - - def __str__(self): - return self.__class__.__name__ - - -# concrete states of the process group - - -class Error(BaseState): - """This is the fallback state if an issue with the group occurs. - The state of the processes is undefined here. - """ - - forbidden: list = [s for s in PGSignals if s not in [PGSignals.KILL, PGSignals.STOP]] + # missing shared memory implementation, so I am using a Queue + self.group_state = None + + # Create some channels for talking between Manager threads and then the ProcessGroupState + # state thread + self._pg_ch_in_sdesc = pg_ch_in_sdesc + self._pg_ch_out_sdesc = pg_ch_out_sdesc + self._gc_ch_in = create(m_uid=default_pool_muid_from_index(this_process.index)) + self._gc_ch_out = create(m_uid=default_pool_muid_from_index(this_process.index)) + self._state_runner_out = create(m_uid=default_pool_muid_from_index(this_process.index)) + + # create a pipe to make sure I can + + signal_args = (self._pg_ch_in_sdesc, self._pg_ch_out_sdesc, + self._gc_ch_in.sdesc, self._gc_ch_out.sdesc, + self._state_runner_out.sdesc) + self._proc = Process(target=self._signal_handler, + args=signal_args) + self._proc.start() - def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: + def __del__(self): try: - LOG.error(f"Process Group {self.context} is in error state.") + self._proc.join(timeout=self.update_interval_sec) except Exception: pass - time.sleep(self.context.update_interval_sec * 10) - - -class Idle(BaseState): - """This state kills existing processes and does nothing otherwise.""" - - forbidden = [PGSignals.KILL] - - def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: - """The idle state just does nothing except making sure all processes are gone.""" - - if self.context.guid: - group_kill(self.context.guid) - self.context.guid = self.context._group_descr = None - - self._start_time = None - - -class Maintain(BaseState): - """This state starts missing processes and restarts processes that are not - alive anymore. - - :raises DragonProcessGroupError: If one of the processes could not be (re)started. 
- """ - - forbidden = [PGSignals.NEW, PGSignals.START] - - def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: - - nretries = self.context.num_restart_retries - - if not self.context.guid: - self.context._start_group_once() - - guid = self.context.guid - - group_descr = None - for i, lst in enumerate(self.context._group_descr.sets): - for descr in lst: - puid = descr.uid - # check the status of this process by asking GS - proc_desc = process_query(puid) - if proc_desc.state == descr.desc.State.ACTIVE: - continue - - # remove process from group - group_descr = remove_from(guid, [puid]) - - # create a new one - msg = self.context.messages[self.context.puid_to_message_map[puid]] - nrestarts = 0 - - while nrestarts < nretries: - try: - group_descr = create_add_to(guid, [(1, msg.serialize())], self.context.policy) - # update the puid_to_message_map dict with the newly added process - puids = [descr.uid for lst in group_descr.sets for descr in lst] - for new_puid in puids: - if new_puid not in self.context.puid_to_message_map: - self.context.puid_to_message_map[new_puid] = self.context.puid_to_message_map[ - puid - ] - # since we added only one process, we can safely assume that we found it - break - break - except Exception as e: - nrestarts += 1 - - if nrestarts == nretries: - raise DragonProcessGroupError(f"Unable to start process {i} using message {msg}.") - - # we need to update the group descriptor after all the above additions/removals - if group_descr is not None: - self.context._group_descr = group_descr - -class Running(BaseState): - - # user needs to wait for group to become IDLE - forbidden = [ - s for s in PGSignals if s not in [PGSignals.ERROR, PGSignals.KILL, PGSignals.SHUTDOWN, PGSignals.JOIN] - ] - - def run(self, prior_state: BaseState, pgsignal: PGSignals, sig_id: int) -> None: - - # this is more complicated as it needs to be, because we're using - # process objects and multi_join wants puids. - - timeout = self.context.update_interval_sec - ignore_err = self.context.ignore_error_on_exit - - if prior_state == Idle: # if we started with restart == False from Idle - self.context._start_group_once() - self.context.update_status(sig_id) - - if pgsignal == PGSignals.SHUTDOWN: # have processes exit - group_kill(self.context.guid, sig=signal.SIGTERM) - - # collect puids - puids = self.context.puids - - # join on them - ready = multi_join(puids, join_all=True, timeout=timeout) - - if ready[0] != None: # no timeout - - # catch bad ecodes - if not ignore_err: - for puid, ecode in ready[0]: - if ecode not in {0, CYTHON_SIGTERM_ECODE, CYTHON_SIGTERM_ECODE % 256}: - LOG.debug(f"Bad exit code {ecode} for puid {puid} in ProcessGroup") - raise DragonProcessGroupAbnormalExit("Some processes in group exited abnormally !") - - # move on to Idle - prior_state = self.context.transition_to(Idle) - self.context.run(prior_state, pgsignal, sig_id) - self.context.update_status(None) - -class Stop(BaseState): - """Stops all processes of the group and removes the group from the Manager. The - group cannot be restarted anymore. - """ - - forbidden = [s for s in PGSignals] # end of line - - def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: - - if self.context.guid != None: - destroy(self.context.guid) - self.context.guid = self.context._group_descr = None - - -# end concrete state classes - - -class GroupContext: - """The Context defines the group interface for the manager and the client. - In particular, it handles signals and state changes. 
It maintains a - reference to an instance of a State subclass, which represents the current - state of the group of processes. - """ - - _state: BaseState = None - - update_interval_sec: float = 0.5 - num_kill_retries: int = 2 - num_restart_retries: int = 2 - - # why have more than 1 target status ? - # because JOIN transitions to Join, and then auto-transitions to Idle. - # so the target state for the manager is Join, but the client has to - # wait for Signal completion on Join and Idle to not introduce a race condition. - # The same is true for SHUTDOWN, which is JOIN with a SIGTERM. - - target_states: dict = { - PGSignals.ERROR: [Stop], - PGSignals.NEW: [Idle], - PGSignals.START: [Maintain], - PGSignals.JOIN: [Running, Idle], - PGSignals.SHUTDOWN: [Running, Idle], - PGSignals.KILL: [Idle], - PGSignals.STOP: [Stop], - } - - forbidden: dict = { - str(Error()): Error.forbidden, - str(Idle()): Idle.forbidden, - str(Maintain()): Maintain.forbidden, - str(Running()): Running.forbidden, - str(Stop()): Stop.forbidden, - } - - def __init__( - self, - templates: list[tuple], - nproc: int, - restart: bool, - ignore_error_on_exit: bool, - pmi_enabled: bool, # MPI jobs - walltime: float, - policy: Policy, - ) -> None: - """This class represents a group of processes and exposes an interface - to the Manager to handle signals and state transitions. - - :param templates: a list of tuples where each tuple contains a replication factor `n` and a Dragon TemplateProcess object specifing the properties of the process to start. The processes can hold the same or different attributes in any way and will be numbered in order. - :type templates: list[tuple(int, dragon.native.process.TemplateProcess),] - :param nproc: total number of processes that belong to the ProcessGroup - :type nproc: int - :param restart: wether to restart processes that exited prematurely, defaults to True - :type restart: bool - :param ignore_error_on_exit: whether to ignore errors when the group exists, defaults to False - :type ignore_error_on_exit: bool - :param pmi_enabled: wether to instruct Dragon to enable MPI support, defaults to False - :type pmi_enabled: bool - :param walltime: time in seconds to run processes before killing them. 
- :type walltime: float - :param policy: determines the placement of the group resources - :type policy: dragon.infrastructure.policy.Policy - """ - - self.nproc = nproc - self.templates = templates # list of tuples - self.messages = {} # keys are the indices of tuples in self.templates - - # use a dict to make restarting easy and order-safe - self.puid_to_message_map = {} # keys are the puids, values are the keys in self.messages{} - - for i, tup in enumerate(templates): - t = tup[1] - if t.is_python: - self.messages[i] = get_create_message_with_argdata( - t.target, - t.cwd, - t.args, - t.env, - t.argdata, - pmi_required=pmi_enabled, - stdin=t.stdin, - stdout=t.stdout, - stderr=t.stderr, - ) - else: - self.messages[i] = get_create_message( - t.target, - t.cwd, - t.args, - t.env, - pmi_required=pmi_enabled, - stdin=t.stdin, - stdout=t.stdout, - stderr=t.stderr, - ) - - self.guid = None - self._group_descr = None - - self.restart = restart - self.ignore_error_on_exit = ignore_error_on_exit - self.pmi_enabled = pmi_enabled - self.walltime = walltime - if policy: - self.policy = policy - else: - self.policy = Policy() - - self._start_time = 0 - - self._state = None - - # count signals so we can report completion - self._signal_counter = Value("i", value=0) - self._signal_counter_lock = Lock() - - self._status_queue = Queue() - self._status_queue.put((-1, -1, -1)) - - self.transition_to(Idle) - self.update_status(None) - - def __del__(self): - + # make manager exit try: - self._status_queue.put(True) + self._proc.kill() # please go away except Exception: pass - def handle_signal(self, signal: PGSignals) -> tuple: - """This method takes a signal, checks if the signal is allowed, given - the current state, and returns the new state. - If the walltime is up, it overwrites the signal with kill. + def kill(self): + """Kill the manager process""" - :param signal: The signal to consider - :type signal: Signal - :return: target state - :rtype: BaseState - :raises RuntimeError: raised, if the transition is not allowed - """ - - if signal in self._state.forbidden: - LOG.error(f"Signal not accepted {signal} for state {self._state} !") - return - - if self._walltime_expired(): - signal = PGSignals.KILL # goodbye + self._proc.kill() - if signal: - next_state = self.target_states[signal][0] - else: - next_state = self.current_state - - return next_state, self.current_state + def join(self, timeout=None): + """Join on the manager process - def transition_to(self, new_state: BaseState) -> BaseState: - """Transition the context to a new state. This does NOT run - the new state or updates the status. - - :param new_state: state to transition to - :type new_state: BaseState - :return: previous state - :rtype: BaseState + :param timeout: how long to wait before returning. If none, block. defaults to None + :type timeout: float, optional """ - - prior_state = self._state - - self._state = new_state - self._state.context = self - - return prior_state - - @property - def current_state(self) -> BaseState: - """Return the current state object. - - :return: the state - :rtype: BaseState - """ - - return self._state - - def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int): - """Execute the current (!) state in self._state. - - :param prior_state: prior state - :type prior_state: BaseState - """ - try: - self._state.run(self._state, prior_state, signal, sig_id) # Why ? 
- except Exception as e: # automagically transition to Error if something goes wrong - LOG.error(f"Exception in transition {prior_state} -> {self._state}: {e} ") - failed_state = self._state - self.transition_to(Error) - self.update_status(None) - self.run(self._state, None, sig_id) - - def get_next_sig_id(self) -> int: - """Obtain the next unique signal ID for this group context - - :return: next signal id - :rtype: int - """ - - # we need to communicate if a signal completed, so we need an ID for it. - - with self._signal_counter_lock: - sig_id = self._signal_counter.value - self._signal_counter.value = 1 + self._signal_counter.value - - return sig_id + if self.is_alive: + self._proc.join(timeout=timeout) + except AttributeError: + pass @property - def status(self) -> str: - """The current status of the Group. + def is_alive(self) -> bool: + """Whether the manager process is still alive - :return: state name - :rtype: str + :returns: True if alive. False otherwise + :rtype: {bool} """ - - item = self._status_queue.get(timeout=None) - self._status_queue.put(item) - state_name, _, __ = item - - return state_name - - @property - def puids(self) -> list[int or None]: - """The puids maintained by the GroupContext. - - :return: A list of puids if the process is started or None if not. - :rtype: list[int or None] + try: + return self._proc.is_alive + except Exception: + return False + + def _listen_for_pgsignals(self, + pg_sdesc_in: str, + pg_sdesc_out: str, + gc_sdesc_in: str, + gc_sdesc_out: str): + """Function executed as thread for listening to ProcessGroup requests from the user + + :param pg_sdesc_in: Channel used for receiving messages from user + :type pg_sdesc_in: str + :param pg_sdesc_out: Channel used for sending message back to user + :type pg_sdesc_out: str + :param gc_sdesc_in: Channel for receiving messages from thread managing ProcessGroup state + :type gc_sdesc_in: str + :param gc_sdesc_out: Channel for sending messages to thread managing ProcessGroup state + :type gc_sdesc_out: str """ + signal = None + exit_loop = False - item = self._status_queue.get(timeout=None) - self._status_queue.put(item) - _, puids, __ = item - - return puids - - @property - def last_completed_signal(self) -> int: - """Return the ID of the last successfully completed signal. - - :return: the signal id - :rtype: int - """ + pg_ch_in = Channel.attach(pg_sdesc_in) + pg_ch_out = Channel.attach(pg_sdesc_out) + pg_inout = Connection(inbound_initializer=pg_ch_in, + outbound_initializer=pg_ch_out) - item = self._status_queue.get(timeout=None) - self._status_queue.put(item) - _, __, compl_sig_id = item + gc_ch_in = Channel.attach(gc_sdesc_in) + gc_ch_out = Channel.attach(gc_sdesc_out) + gc_inout = Connection(inbound_initializer=gc_ch_in, + outbound_initializer=gc_ch_out) - return compl_sig_id + response_signals = [PGSignals.GROUP_STARTED, + PGSignals.GROUP_KILLED, + PGSignals.RETURN_TO_IDLE, + PGSignals.REQ_PUIDS_RESPONSE, + PGSignals.REQ_INACTIVE_PUIDS_RESPONSE] - def _start_group_once(self): - if not self._start_time: - self._start_time = time.monotonic() + while not exit_loop: - group_descr = create( - [(tup[0], self.messages[i].serialize()) for i, tup in enumerate(self.templates)], self.policy - ) + try: + signal, payload, sig_id = pg_inout.recv() + except Exception: + signal = payload = sig_id = None # no change - self._group_descr = group_descr - self.guid = group_descr.g_uid + if signal == PGSignals.NEW and payload: # new state object ! 
+ self.group_state = payload - # construct the puid_to_message_map dict now that the processes are created - puids = [descr.uid for lst in group_descr.sets for descr in lst] - puid_idx = 0 - for i, tup in enumerate(self.templates): - for _ in range(tup[0]): - self.puid_to_message_map[puids[puid_idx]] = i - puid_idx += 1 + if signal == PGSignals.EXIT_LOOP: + exit_loop = True + elif signal in response_signals: + pg_inout.send(signal) + continue - def _walltime_expired(self) -> bool: - """Check if the walltime has expired + # this looks overengineered, but I need to cover Join : + # * Join.run depends on the prior state (Idle or Maintain), need to save it + # * transitioning is not followed by a run for Join->Idle + # * update_status has to be run multiple times by a Join + # * we update the status without running the state in Join + if self.group_state: + gc_inout.send((signal, sig_id)) - :return: True if the walltime has expired, False otherwise - :rtype: bool - """ + pg_inout.send(None) + gc_inout.send((None, None)) - if self.walltime and self._start_time > 0: - expired = time.monotonic() - self._start_time >= self.walltime - else: - expired = False + try: + pg_inout.ghost_close() + except Exception: + pass - return expired + try: + gc_inout.close() + except Exception: + pass - # this is a hack using a queue to communicate the status. We would want to - # use a few Array objects and a single lock to share the puids and status - # string. I need to report the last completed signal ID here, so that the - # corresponding client can be be sure his request has been completed. + try: + self._gc_ch_in.destroy() + except Exception: + pass - def update_status(self, compl_sig_id: int): - """update the global status of this context. + try: + self._gc_ch_out.destroy() + except Exception: + pass - :param compl_sig_id: signal just completed. If None, reuse the last value from the queue. - :type compl_sig_id: int + def _monitor_group_state(self, + gc_sdesc_in: str, + pg_sdesc_out: str, + state_sdesc_out: str): + """Function run as thread monitoring the ProcessGroup state via the ProcessGroupState management thread + + :param gc_sdesc_in: Channel for receiving messages from the Manager's listener thread + :type gc_sdesc_in: str + :param pg_sdesc_out: Channel for sending mesages to the user's ProcessGroup object + :type pg_sdesc_out: str + :param state_sdesc_out: Channel for sending signals to the ProcessGroupState's state runner thread + :type state_sdesc_out: str """ - _, __, last_sig_id = self._status_queue.get(timeout=None) - - if compl_sig_id == None: # this transition was automatic - compl_sig_id = last_sig_id + gc_ch_in = Channel.attach(gc_sdesc_in) + gc_in = Connection(inbound_initializer=gc_ch_in) - state_name = str(self._state.__name__) + pg_ch_out = Channel.attach(pg_sdesc_out) + pg_out = Connection(outbound_initializer=pg_ch_out) - if self._group_descr: - puids = [descr.uid for lst in self._group_descr.sets for descr in lst] - else: - puids = [None for _ in range(self.nproc)] - - item = (state_name, puids, compl_sig_id) - self._status_queue.put(item) + state_ch_in = Channel.attach(state_sdesc_out) + state_out = Connection(outbound_initializer=state_ch_in) + response_signals = [PGSignals.GROUP_STARTED, + PGSignals.GROUP_KILLED, + PGSignals.RETURN_TO_IDLE, + PGSignals.REQ_PUIDS_RESPONSE, + PGSignals.REQ_INACTIVE_PUIDS_RESPONSE] -class Manager: - def __init__(self): - """The Manager class holds a process that handles the life-cycle of all - process groups on this node. 
- We handle the group using a group_context class that holds all necessary - methods and attributes shared by manager and clients. - """ + # Wait until we know we have a state before I try referencing it + while not self.group_state: + sleep(0.001) - # I am not yet using a single node-wide process here, because I need to - # discover it and its communication queue, but I cannot ask Dragon for that - # named queue yet. + self.runner_thread = self.group_state.start_state_runner(state_sdesc_out, gc_sdesc_in) - node = current_node() + running = True + while running: - # missing shared memory implementation, so I am using a Queue - self.group_context = None + signal, sig_id = gc_in.recv() + if signal is None and sig_id is None: + running = False + break + elif signal in response_signals: + pg_out.send((signal, None, None)) + continue - ident = f"_DPoolQ1-{node.h_uid}-{id(self)}" - self.queue = Queue() # ident=ident) # need a named queue here + state_out.send((signal, sig_id)) - ident = f"_DPoolMan-{node.h_uid}-{id(self)}" - self._proc = Process( - self._signal_handler, - (self.queue,), - # ident=ident, - ) - self._proc.start() - - def __del__(self): + # Let the processgroup thread now we're done + pg_out.send((PGSignals.EXIT_LOOP, None, None)) + self.group_state.stop_state_runner() + try: + gc_in.ghost_close() + except Exception: + pass - # Kill processes and remove context try: - self.queue.put((PGSignals.STOP, None)) + pg_out.ghost_close() except Exception: pass - # wait for manager to exit try: - self._proc.join(timeout=self.update_interval_sec) + state_out.close() except Exception: pass - # make manager exit try: - self._proc.kill() # please go away + self._state_runner_out.destroy() except Exception: pass - def _signal_handler(self, queue: Queue): - """Get a new signal from the queue, obtain the new state, transition the - context, run the new context, update the status. - If the queue is empty, run the current context again. + def _signal_handler(self, + pg_ch_in_sdesc: str, pg_ch_out_sdesc: str, + gc_ch_in_sdesc: str, gc_ch_out_sdesc: str, + state_runner_out_sdesc: str): + """Function that is the 'Manager' and launched as a new Process to execute the ProcessGroup + + This function is only responsible for launching the listener and monitor threads to manage + ProcessGroup requests from the user and monitoring the state of the ProcessGroupState + object. It returns when the thread have finished their work. + + :param pg_sdesc_in: Channel used for receiving messages from user + :type pg_sdesc_in: str + :param pg_sdesc_out: Channel used for sending message back to user + :type pg_sdesc_out: str + :param gc_sdesc_in: Channel for receiving messages from thread managing ProcessGroup state + :type gc_sdesc_in: str + :param gc_sdesc_out: Channel for sending messages to thread managing ProcessGroup state + :type gc_sdesc_out: str + :param state_sdesc_out: Channel for sending signals to the ProcessGroupState's state runner thread + :type state_sdesc_out: str """ - signal = None - - while signal != PGSignals.STOP: - - try: - signal, payload, sig_id = queue.get(timeout=0) - except Empty: - signal = payload = sig_id = None # no change - - if signal == PGSignals.NEW and payload: # new context ! - self.group_context = payload + # Create one thread that listens for messages from the user via + # the ProcessGroup user API. 
Create another thread that monitors + # execution of jobs in the ProcessGroup via it's ProcessGroupState object + pgroup_listener_thread = threading.Thread(name="ProcessGroup listener", + target=self._listen_for_pgsignals, + args=(pg_ch_in_sdesc, pg_ch_out_sdesc, gc_ch_in_sdesc, gc_ch_out_sdesc), + daemon=False) - # this looks overengineered, but I need to cover Join : - # * Join.run depends on the prior state (Idle or Maintain), need to save it - # * transitioning is not followed by a run for Join->Idle - # * update_status has to be run multiple times by a Join - # * we update the status without running the state in Join - if self.group_context: - new_state, prior_state = self.group_context.handle_signal(signal) - self.group_context.transition_to(new_state) - self.group_context.run(prior_state, signal, sig_id) - self.group_context.update_status(sig_id) + gstate_monitor_thread = threading.Thread(name="ProcessGroupState monitorer", + target=self._monitor_group_state, + args=(gc_ch_out_sdesc, pg_ch_in_sdesc, state_runner_out_sdesc), + daemon=False) - time.sleep(self.group_context.update_interval_sec) # do not spin too hot + pgroup_listener_thread.start() + gstate_monitor_thread.start() - self.group_context = None + pgroup_listener_thread.join() + gstate_monitor_thread.join() class ProcessGroup: @@ -703,39 +355,79 @@ def __init__( self.pmi_enabled = pmi_enabled self.walltime = walltime self.policy = policy - self._group_context = None + self._group_state = None + + # We may catch exit of the state runner in a few places. We set this true + # to make sure we don't get hung in places + self._state_exit_signaled = False + + self._pg_ch_in = create(m_uid=default_pool_muid_from_index(this_process.index)) + self._pg_ch_out = create(m_uid=default_pool_muid_from_index(this_process.index)) + self._conn_options = ConnectionOptions(creation_policy=ConnectionOptions.CreationPolicy.PRE_CREATED) + self._pg_inout = Connection(inbound_initializer=self._pg_ch_in.sdesc, + outbound_initializer=self._pg_ch_out.sdesc, + options=self._conn_options) - def add_process(self, nproc: int, template: TemplateProcess) -> None: + def __del__(self): + + try: + self._pg_inout.send((PGSignals.EXIT_LOOP, None, None)) + except Exception: + pass + + try: + self._manager.join(timeout=1) + except Exception: + self._manager.kill() + finally: + del self._manager + + try: + self._pg_inout.close() + except Exception: + pass + + try: + self._pg_ch_in.destroy() + except Exception: + pass + + try: + self._pg_ch_out.destroy() + except Exception: + pass + + def add_process(self, nproc: int, template: ProcessTemplate) -> None: """Add processes to the ProcessGroup. :param template: single template processes, i.e. unstarted process objects - :type template: dragon.native.process.TemplateProcess + :type template: dragon.native.process.ProcessTemplate :param nproc: number of Dragon processes to start that follow the provided template :type nproc: int """ # if add_process is called after the ProcessGroup is initialized, then we raise - if self._group_context: - raise DragonProcessGroupError( - "You cannot call add_process() to already initialized ProcessGroup. Please use ProcessGroup.create_add_to() instead to add more processes." 
- ) + if self._group_state: + # TODO: Consider adding ProcessGroup.create_add_to() to allow users to add more template processes after init has been called + raise DragonProcessGroupError("You cannot call add_process() to already initialized ProcessGroup.") self.templates.append((nproc, template)) self.nproc += nproc def init(self) -> None: - """Initialize the GroupContext and Manager.""" + """Initialize the ProcessGroupState and Manager.""" - self._group_context = GroupContext( + self._group_state = ProcessGroupState( self.templates, self.nproc, self.restart, self.ignore_error_on_exit, self.pmi_enabled, self.walltime, - self.policy, + self.policy ) - self._manager = Manager() + + self._manager = Manager(self._pg_ch_out.sdesc, self._pg_ch_in.sdesc) self._send_signal(PGSignals.NEW) def start(self) -> None: @@ -743,12 +435,15 @@ def start(self) -> None: False`, transition to 'Running', otherwise transition to 'Maintain'. """ + # This needs to be done before we signal starting of the procs so the group + # state gets marked as critical if we're running inside a with block + if not self.restart: self._send_signal(PGSignals.JOIN) else: self._send_signal(PGSignals.START) - def join(self, timeout: float = None) -> None: + def join(self, timeout: float = None, save_puids: bool = False) -> None: """Wait for all processes to complete and the group to transition to Idle state. If the group status is 'Maintain', transition to 'Running'. @@ -762,17 +457,30 @@ def join(self, timeout: float = None) -> None: start = time.monotonic() if self.status == str(Maintain()): - self._send_signal(PGSignals.JOIN) + if save_puids: + self._send_signal(PGSignals.JOIN_SAVE) + else: + self._send_signal(PGSignals.JOIN) stop = time.monotonic() - if timeout == None: + if timeout is None: + _ = self._pg_inout.recv() + else: + try: + timeout = max(0, timeout - (stop - start)) + if self._pg_inout.poll(timeout=timeout): + _ = self._pg_inout.recv() + except TimeoutError: + raise TimeoutError("Timeout waiting for status Idle") + + # This is here just to make sure tests pass till they get updated. This is as close to a non-op as + # possible at this point in the code + if timeout is None: timeout = 100000000 - timeout = max(0, timeout - (stop - start)) + self._wait_for_status(str(Idle()), timeout=timeout) - return self._wait_for_status(str(Idle()), timeout=timeout) - - def kill(self, signal: signal.Signals = signal.SIGKILL) -> None: + def kill(self, signal: signal.Signals = signal.SIGKILL, save_puids: bool = False) -> None: """Send a signal to each process of the process group. The signals SIGKILL and SIGTERM have the following side effects: @@ -786,30 +494,68 @@ def kill(self, signal: signal.Signals = signal.SIGKILL) -> None: """ if signal == signal.SIGTERM: - self._send_signal(PGSignals.SHUTDOWN) + if save_puids: + self._send_signal(PGSignals.SHUTDOWN_SAVE) + else: + self._send_signal(PGSignals.SHUTDOWN) elif signal == signal.SIGKILL: - self._send_signal(PGSignals.KILL) + if save_puids: + self._send_signal(PGSignals.KILL_SAVE) + else: + self._send_signal(PGSignals.KILL) else: for puid in self.puids: process_kill(puid, sig=signal) - def stop(self) -> None: + def stop(self, save_puids=False) -> None: """Forcibly terminate all workers by sending `SIGKILL` from any state, transition to `Stop`. This also removes the group from the manager process and marks the end of the group life-cycle. 
""" - - self._send_signal(PGSignals.STOP) + if save_puids: + self._send_signal(PGSignals.STOP_SAVE) + else: + self._send_signal(PGSignals.STOP) @property def puids(self) -> list[int]: """Return the puids of the processes contained in this group. :return: a list of puids - :rtype: list[int or None] + :rtype: list[int] + """ + # Send a request to process group to make sure this is up-to-date + if self._manager.is_alive: + self._send_signal(PGSignals.REQ_PUIDS) + + return self._group_state.puids + + @property + def inactive_puids(self) -> List[Tuple[int, int]]: + """Return the group's puids and their exit codes that have exited + + :returns: a list of tuples (puid, exit_code) + :rtype: List[Tuple[int, int]] """ + # Send a request to process group to make sure this is up-to-date + if self._manager.is_alive: + self._send_signal(PGSignals.REQ_INACTIVE_PUIDS) - return self._group_context.puids + return self._group_state.inactive_puids + + @property + def exit_status(self) -> List[Tuple[int, int]]: + """Return the group's puids and their exit codes that have exited + + :returns: a list of tuples (puid, exit_code) + :rtype: List[Tuple[int, int]] + """ + + return self.inactive_puids + + def _get_status(self): + + self._pg_inout.send((signal, None, None)) @property def status(self) -> str: @@ -818,37 +564,58 @@ def status(self) -> str: :returns: current status of the group :rtype: str """ + stat = self._group_state.status - return self._group_context.status + return stat # Private interface - def _send_signal(self, signal: PGSignals) -> bool: """Send the signal to the manager and wait for the response. The method guarantees completion of the signal by the manager, nothing more. I.e. the processes may have been started, but not actually executed any useful code yet. - In case of sending IDLE, include the group context as well. + In case of sending IDLE, include the group state as well. 
""" - status = self.status - if signal in self._group_context.forbidden[status]: + if signal in self._group_state.forbidden[status] and signal not in [PGSignals.REQ_PUIDS, PGSignals.REQ_INACTIVE_PUIDS]: raise DragonProcessGroupError(f"Signal {str(signal)} is not a valid transition from {status}") if signal == PGSignals.NEW: - payload = self._group_context + payload = self._group_state else: payload = None - sig_id = self._group_context.get_next_sig_id() + sig_id = self._group_state.get_next_sig_id() + self._pg_inout.send((signal, payload, sig_id)) + + # The following are all messages that block on a receipt message + if signal == PGSignals.START or (signal == PGSignals.JOIN and self.status == str(Idle())): + msg = self._pg_inout.recv() + + elif signal in [PGSignals.SHUTDOWN, PGSignals.SHUTDOWN_SAVE]: + msg = self._pg_inout.recv() + + elif signal in [PGSignals.REQ_PUIDS, PGSignals.REQ_INACTIVE_PUIDS]: + break_loop = False + while not break_loop: + msg = self._pg_inout.recv() + if msg in [PGSignals.REQ_PUIDS_RESPONSE, PGSignals.REQ_INACTIVE_PUIDS_RESPONSE, None]: + break_loop = True + if msg is None: + # None means the state manager has begun its exit and isn't going to return requests + self._state_exit_signaled = True + return - self._manager.queue.put((signal, payload, sig_id)) + # Before entering the loop, make sure we haven't already been told of the state runner's exit: + if self._state_exit_signaled: + return - while self._group_context.last_completed_signal < sig_id: + # Otherwise, enter the loop here + while self._group_state.last_completed_signal < sig_id: if self.status == str(Error()): - if signal not in [PGSignals.KILL, PGSignals.STOP]: - if self._group_context.last_completed_signal == sig_id - 1: + if signal not in [PGSignals.KILL, PGSignals.KILL_SAVE, PGSignals.STOP, PGSignals.STOP_SAVE]: + if self._group_state.last_completed_signal == sig_id - 1: raise DragonProcessGroupError( f"Signal {str(signal)} was not successful. Group in ERROR state." ) @@ -857,19 +624,26 @@ def _send_signal(self, signal: PGSignals) -> bool: f"Signal {str(signal)} cannot be completed. 
Group in ERROR state" ) - time.sleep(self._group_context.update_interval_sec) + # If the ProcessGroupState has exited because all processes have finished, the Manager will + # have told me + try: + if self._pg_inout.poll(): + msg = self._pg_inout.recv() + if msg is None: + # We end up here because the GUID has been killed + break + except Exception: + pass - def _wait_for_status(self, status: str, timeout: float = None) -> None: + # Don't overwhelm the state object while we wait for the request to complete + sleep(0.001) - sleep_time = self._group_context.update_interval_sec + def _wait_for_status(self, status: str, timeout: float = 0.0) -> None: start = time.monotonic() - - while not self._group_context.status == status: - + while not self.status == status: dt = timeout - time.monotonic() + start if dt <= 0: raise TimeoutError(f"Timeout waiting for status {status}") - - time.sleep(min(sleep_time, dt)) + sleep(0.1) diff --git a/src/dragon/native/value.py b/src/dragon/native/value.py index 159132d..be084b4 100644 --- a/src/dragon/native/value.py +++ b/src/dragon/native/value.py @@ -100,7 +100,13 @@ def __init__(self, typecode_or_type, value: int = 0, m_uid: int = _DEF_MUID): valbytes = self._value2valbytes(value) # create value in shared memory - msg = Message.create_alloc(self._mpool, 8) + mpool = self._channel.get_pool() + # if the channel isn't local, then fall back on using the default + # allocation pool + if not mpool.is_local: + mpool = self._channel.default_alloc_pool + + msg = Message.create_alloc(mpool, 8) mview = msg.bytes_memview() mview[: len(valbytes)] = valbytes diff --git a/src/dragon/pydragon_channels.pyx b/src/dragon/pydragon_channels.pyx index d499720..8a6c75f 100644 --- a/src/dragon/pydragon_channels.pyx +++ b/src/dragon/pydragon_channels.pyx @@ -1,6 +1,7 @@ from dragon.dtypes_inc cimport * from dragon.channels cimport * from dragon.managed_memory cimport * +from dragon.rc import DragonError import dragon.dtypes as dtypes import dragon.infrastructure.parameters as dparms from dragon.utils import B64 @@ -215,16 +216,31 @@ cdef class Message: # @MCB: Should this and create_alloc be combined into a regular __init__()? 
# create_empty is only used by Receive Handle objects @staticmethod - def create_from_mem(MemoryAlloc mem): + def create_from_mem(MemoryAlloc mem, hints = 0, clientid = 0): """ Create a new Message object with no memory backing :return: New Message Object """ - cdef dragonError_t derr + cdef: + dragonError_t derr + dragonMessageAttr_t attrs msg = Message() - derr = dragon_channel_message_init(&msg._msg, &mem._mem_descr, NULL) + derr = dragon_channel_message_attr_init(&attrs); + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not create default message attributes", derr) + + if (not type(hints) is int) or hints < 0: + raise ChannelError("The hints argument must be a non-negative integer.", DragonError.INVALID_ARGUMENT) + + if (not type(clientid) is int) or clientid < 0: + raise ChannelError("The clientid argument must be a non-negative integer.", DragonError.INVALID_ARGUMENT) + + attrs.hints = hints + attrs.clientid = clientid + + derr = dragon_channel_message_init(&msg._msg, &mem._mem_descr, &attrs) if derr != DRAGON_SUCCESS: raise ChannelError("Could not create empty message", derr) msg._allocated = 0 @@ -250,7 +266,7 @@ cdef class Message: return msg @staticmethod - def create_alloc(MemoryPool mpool, size_t nbytes, timeout=None): + def create_alloc(MemoryPool mpool, size_t nbytes, hints = 0, clientid = 0, timeout=None): """ Allocate memory and a new message object for inserting data into and sending @@ -264,7 +280,20 @@ cdef class Message: dragonMemoryDescr_t * mem timespec_t alloc_timeout timespec_t timer - timespec_t* time_ptr + dragonMessageAttr_t attrs + + derr = dragon_channel_message_attr_init(&attrs); + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not create default message attributes", derr) + + if (not type(hints) is int) or hints < 0: + raise ChannelError("The hints argument must be a non-negative integer.", DragonError.INVALID_ARGUMENT) + + if (not type(clientid) is int) or clientid < 0: + raise ChannelError("The clientid argument must be a non-negative integer.", DragonError.INVALID_ARGUMENT) + + attrs.hints = hints + attrs.clientid = clientid time_ptr = _compute_timeout(timeout, NULL, &timer) @@ -281,7 +310,7 @@ cdef class Message: if merr != DRAGON_SUCCESS: raise ChannelError("Could not get size memory descriptor", merr) - derr = dragon_channel_message_init(&msg._msg, mem, NULL) + derr = dragon_channel_message_init(&msg._msg, mem, &attrs) if derr != DRAGON_SUCCESS: raise ChannelError("Could not create message", derr) @@ -349,6 +378,68 @@ cdef class Message: mview = self.bytes_memview() return mview.tobytes() + @property + def hints(self): + cdef: + dragonError_t derr + dragonMessageAttr_t attrs + + derr = dragon_channel_message_getattr(&self._msg, &attrs) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not get message attributes", derr) + + return attrs.hints + + @hints.setter + def hints(self, value): + cdef: + dragonError_t derr + dragonMessageAttr_t attrs + + if (not type(value) is int) or value < 0: + raise ChannelError("The hints attribute must be a non-negative integer.", DragonError.INVALID_ARGUMENT) + + derr = dragon_channel_message_getattr(&self._msg, &attrs) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not get message attributes", derr) + + attrs.hints = value + + derr = dragon_channel_message_setattr(&self._msg, &attrs) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not set message attributes", derr) + + @property + def clientid(self): + cdef: + dragonError_t derr + dragonMessageAttr_t attrs + + derr = 
dragon_channel_message_getattr(&self._msg, &attrs) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not get message attributes", derr) + + return attrs.clientid + + @clientid.setter + def clientid(self, value): + cdef: + dragonError_t derr + dragonMessageAttr_t attrs + + if (not type(value) is int) or value < 0: + raise ChannelError("The clientid attribute must be a non-negative integer.", DragonError.INVALID_ARGUMENT) + + derr = dragon_channel_message_getattr(&self._msg, &attrs) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not get message attributes", derr) + + attrs.clientid = value + + derr = dragon_channel_message_setattr(&self._msg, &attrs) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not set message attributes", derr) + cpdef enum OwnershipOnSend: copy_on_send @@ -942,6 +1033,68 @@ cdef class Channel: return empty_channel._attach(serialized_bytes, mem_pool) + @classmethod + def make_process_local(cls, timeout=None): + """ + Create a process local channel which is a channel that exists for the + sole purpose of use by the current process. The channel is unique + to all nodes and may be shared with other processes but is + managed with the life-cycle of the current process. When the + current process, the one calling this function, exits, the + process local channel it created via this call will also be + destroyed. + + This is especially useful for processes that need a channel to receive + requests or need to have a place where responses to requests of + other processes can be sent. Most likely calls to this function + will exist inside of some other API. + + :param timeout: Default is None which means to block without timeout until the + channel is made. This should not timeout and should be processed quickly. If a + timeout value is specified, it is the number of seconds to wait which may be a + float. + + :return: A new channel object. + + :raises: ChannelError if there was an error. Note that the Dragon run-time + must be running to use this function as it interacts with Local Services on the + node on which it is called. + """ + cdef: + dragonError_t derr + timespec_t * time_ptr + timespec_t val_timeout + dragonChannelDescr_t ch + dragonChannelSerial_t ser + + if timeout is None: + time_ptr = NULL + elif isinstance(timeout, int) or isinstance(timeout, float): + if timeout < 0: + raise ValueError('Cannot provide timeout < 0 to make_process_local operation') + # Anything > 0 means use that as seconds for timeout. + time_ptr = & val_timeout + val_timeout.tv_sec = int(timeout) + val_timeout.tv_nsec = int((timeout - val_timeout.tv_sec)*1000000000) + else: + raise ValueError('make_process_local timeout must be a float or int') + + with nogil: + derr = dragon_create_process_local_channel(&ch, time_ptr) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not create process local channel", derr) + + derr = dragon_channel_serialize(&ch, &ser) + if derr != DRAGON_SUCCESS: + raise ChannelError("Could not serialize channel", derr) + + py_obj = ser.data[:ser.len] + + derr = dragon_channel_serial_free(&ser) + + # This inits the rest of the object given the channel descriptor above. + return Channel.attach(py_obj) + def destroy(self): """ Destroys the channel. 
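(Illustrative aside, not part of the patch.) The Channel.make_process_local() classmethod added in the hunk above could be used roughly as sketched below to give a process its own private request channel. The module paths and the Connection wrapper are borrowed from other code in this same patch, so treat the exact import locations and signatures as assumptions rather than verified API.

    from dragon.channels import Channel
    from dragon.infrastructure.connection import Connection

    # Requires a running Dragon run-time: Local Services on this node creates a
    # channel whose life-cycle is tied to the calling process, so it is cleaned
    # up automatically when that process exits.
    resp_ch = Channel.make_process_local()

    # A peer can attach to the serialized descriptor and send to this process;
    # locally the channel is wrapped in a Connection for receiving.
    sdesc = resp_ch.serialize()
    responses = Connection(inbound_initializer=resp_ch)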
@@ -1398,6 +1551,14 @@ cdef _ConnMsgHeader_parse(uint8_t * first_bytes, size_t length): elif first_bytes[0] == _ConnMsgHeader_PROTOCOL_5_FIRSTB and first_bytes[1] == _ConnMsgHeader_PROTOCOL_5_NEXTB: return ChannelAdapterMsgTypes.PICKLE_PROT_5, None elif first_bytes[0] == _ConnMsgHeader_RAW_BYTE_MSG_FIRSTB and length >= _ConnMsgHeader_RAW_HDR_SZ: + # The following if statement is temporary code. It is needed to allow + # C/C++ code to send byte encoded CapNProto messages to connections. + # This code is only needed until we switch over to FLI adapters for + # infrastructure connections. This is needed because the size of + # the CapNProto message is not known until it is written. + if decoded_length == 0xFFFFFFFFFFFFFF: + decoded_length = length - _ConnMsgHeader_RAW_HDR_SZ + return ChannelAdapterMsgTypes.RAW_BYTES, decoded_length else: raise ConnectionError(f'unexpected header: {first_bytes[:8]}') @@ -1761,7 +1922,7 @@ cdef class Peer2PeerWritingChannelFile: self.flush() derr = dragon_memory_free(&self._small_blk_descr) - if derr != DRAGON_MAP_KEY_NOT_FOUND and derr != DRAGON_SUCCESS: + if derr != DRAGON_SUCCESS: raise ChannelError("Could not free small message send buffer", derr) finally: @@ -2589,6 +2750,20 @@ cdef class GatewayMessage: else: raise ChannelError('Attempt to get the sendhid of a non-send Gateway message') + @property + def send_payload_message_attr_clientid(self): + if self.is_send_kind: + return self._gmsg.send_payload_message._attr.clientid + else: + raise ChannelError('Attempt to get the clientid of a non-send Gateway message') + + @property + def send_payload_message_attr_hints(self): + if self.is_send_kind: + return self._gmsg.send_payload_message._attr.hints + else: + raise ChannelError('Attempt to get the hints of a non-send Gateway message') + @property def send_dest_mem_descr_ser(self): """ diff --git a/src/dragon/pydragon_fli.pyx b/src/dragon/pydragon_fli.pyx index 93b3eb2..d9ea3aa 100644 --- a/src/dragon/pydragon_fli.pyx +++ b/src/dragon/pydragon_fli.pyx @@ -6,6 +6,8 @@ import dragon.infrastructure.parameters as dparms import dragon.infrastructure.facts as dfacts import dragon.globalservices.channel as dgchan from dragon.localservices.options import ChannelOptions +from dragon.rc import DragonError +import sys BUF_READ = PyBUF_READ BUF_WRITE = PyBUF_WRITE @@ -33,6 +35,11 @@ cdef timespec_t* _computed_timeout(timeout, timespec_t* time_ptr): return time_ptr class DragonFLIError(Exception): + """ + The DragonFLIError is an exception that can be caught that explicitly targets + those errors generated by the FLI code. The string associated with the + exception includes any traceback avaialable from the C level interaction. + """ def __init__(self, lib_err, msg): cdef char * errstr = dragon_getlasterrstr() @@ -46,21 +53,67 @@ class DragonFLIError(Exception): def __str__(self): return f"FLI Exception: {self.msg}\n*** Dragon C-level Traceback: ***\n{self.lib_msg}\n*** End C-level Traceback: ***\nDragon Error Code: {self.lib_err}" +class DragonFLITimeoutError(DragonFLIError, TimeoutError): + pass + class FLIEOT(DragonFLIError, EOFError): + """ + The FLIEOT Exception is used to indicate the end of stream for an + FLI conversation. This Exception inherits from EOFError so applications + using the FLI may choose to catch EOFError instead. + """ pass cdef class FLISendH: """ - Sending handle for FLInterfaces + Sending handle for FLInterfaces. A send handle is needed when sending + data. 
Proper use of a send handle includes creating it (which also opens + it for sending), sending data with one or more to the send operations, + and closing it once data transmission is complete. """ cdef: dragonFLISendHandleDescr_t _sendh dragonFLIDescr_t _adapter bool _is_open + object _default_timeout def __init__(self, FLInterface adapter, Channel stream_channel=None, timeout=None, use_main_as_stream_channel=False): + """ + When creating a send handle an application may provide a stream + channel to be used. If specifying that the main channel is to be + used as a stream channel then both sender and receiver must agree + to this. Both send and receive handle would need to be specified + using the use_main_as_stream_channel in that case. + + :param adapter: An FLI over which to create a send handle. + + :param stream_channel: Default is None. The sender may supply a stream + channel when opening a send handle. If the FLI is created with + stream channels, then the value of the argument may be None. If + supplied by a user then the main channel of the FLI must exist. + If use_main_as_stream_channel is True, this argument must be + None. + + :param use_main_as_stream_channel: Default is False. If True, then both + send handle and receive handle must be true. This would indicate + that both sender and receiver are agreeing they are the only + sender and the only receiver and they wish to use the single main + channel as the stream channel. This can be useful in some + restricted circumstances but must only be used when there is + exactly one sender and one receiver on the FLI. + + :param timeout: Default is None. None means to block forever. Otherwise + the timeout should be some number of seconds to wait for the + operation to complete. The operation could timeout when not + supplying a stream channel and there is no channel available + during the specified amount of time in the manager channel. The timeout + provided here also becomes the default timeout when used in the context + manager framework. + + :return: An FLI send handle. + """ cdef: dragonError_t derr dragonChannelDescr_t * c_strm_ch = NULL @@ -79,12 +132,27 @@ cdef class FLISendH: with nogil: derr = dragon_fli_open_send_handle(&self._adapter, &self._sendh, c_strm_ch, time_ptr) + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Timed out while opening send handle.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not open send handle stream.") self._is_open = True + self._default_timeout = timeout + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close(timeout=self._default_timeout) def close(self, timeout=None): + """ + When the conversation is complete the send handle should be closed. In the case of a + buffered FLI, no data is sent until the send handle is closed. In all cases, closing + the send handle indicates the end of the stream for the receiver. + """ cdef: dragonError_t derr timespec_t timer @@ -98,6 +166,9 @@ cdef class FLISendH: with nogil: derr = dragon_fli_close_send_handle(&self._sendh, time_ptr) + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Timed out while closing send handle.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not close send handle stream.") @@ -110,6 +181,12 @@ cdef class FLISendH: pass def send_bytes(self, bytes data, uint64_t arg=0, bool buffer=False, timeout=None): + """ + When sending bytes it is possible to specify the bytes to be sent. 
In addition, + you may specify a user specified argument or hint to be sent. If buffer is true, then + data is not actually sent on this call, but buffered for future call or until the send + handle is closed. + """ cdef: dragonError_t derr #uint8_t * c_data @@ -129,23 +206,35 @@ cdef class FLISendH: with nogil: derr = dragon_fli_send_bytes(&self._sendh, data_len, &c_data[0], arg, buffer, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while sending bytes.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Failed to send message over stream channel.") - def send_mem(self, MemoryAlloc mem, uint64_t arg=0, timeout=None): + + def send_mem(self, MemoryAlloc mem, uint64_t arg=0, transfer_ownership=True, timeout=None): cdef: dragonError_t derr timespec_t timer timespec_t* time_ptr + bool _transfer + if self._is_open == False: raise RuntimeError("Handle not open, cannot send data.") time_ptr = _computed_timeout(timeout, &timer) arg_val = arg + _transfer = transfer_ownership with nogil: - derr = dragon_fli_send_mem(&self._sendh, &mem._mem_descr, arg, time_ptr) + derr = dragon_fli_send_mem(&self._sendh, &mem._mem_descr, arg, _transfer, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while sending memory.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Failed to send memory over stream channel.") @@ -168,6 +257,10 @@ cdef class FLISendH: with nogil: derr = dragon_fli_create_writable_fd(&self._sendh, &fdes, buffered, chunk_size, user_arg, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while creating writable file descriptor.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not open writeable file descriptor.") @@ -186,6 +279,10 @@ cdef class FLISendH: with nogil: derr = dragon_fli_finalize_writable_fd(&self._sendh) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while finalizing the writable file descriptor.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not finalize writable file descriptor") @@ -193,17 +290,48 @@ cdef class FLISendH: cdef class FLIRecvH: """ - Receiving handle for FLInterfaces + Receiving handle for FLInterfaces. """ cdef: dragonFLIRecvHandleDescr_t _recvh dragonFLIDescr_t _adapter bool _is_open + object _default_timeout def __init__(self, FLInterface adapter, Channel stream_channel=None, timeout=None, use_main_as_stream_channel=False): """ - Open the handle, optionally with a specific stream channel and timeout + If specifying that the main channel is to be + used as a stream channel then both sender and receiver must agree + to this. Both send and receive handle would need to be specified + using the use_main_as_stream_channel in that case. + + :param adapter: An FLI over which to create a send handle. + + :param stream_channel: Default is None. The receiver may supply a stream + channel when opening a receive handle. If the FLI is created with + stream channels, then the value of the argument may be None. If + supplied by a user then the manager channel of the FLI must exist. + If use_main_as_stream_channel is True, this argument must be + None. + + :param use_main_as_stream_channel: Default is False. If True, then both + send handle and receive handle must be true. This would indicate + that both sender and receiver are agreeing they are the only + sender and the only receiver and they wish to use the single main + channel as the stream channel. 
This can be useful in some + restricted circumstances but must only be used when there is + exactly one sender and one receiver on the FLI. + + :param timeout: Default is None. None means to block forever. Otherwise + the timeout should be some number of seconds to wait for the + operation to complete. The operation could timeout when not + supplying a stream channel and there is no channel available + during the specified amount of time in the manager channel. The timeout + provided here also becomes the default timeout when used in the context + manager framework. + + :return: An FLI send handle. """ cdef: dragonError_t derr @@ -224,10 +352,43 @@ cdef class FLIRecvH: with nogil: derr = dragon_fli_open_recv_handle(&self._adapter, &self._recvh, c_strm_ch, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while opening receive handle.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not open receive handle stream") self._is_open = True + self._default_timeout = timeout + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + mem_discarded = 0 + try: + while not self.stream_received: + mem = None + try: + mem, hint = self.recv_mem() + except EOFError: + pass + + if mem is not None: + mem.free() + mem_discarded += 1 + + self.close(self._default_timeout) + except Exception as ex: + try: + self.close(self._default_timeout) + except: + pass + raise ex + + if mem_discarded > 1: + raise DragonFLIError(DragonError.INVALID_MESSAGE, 'There was message data discarded while closing the FLI recv handle.') def close(self, timeout=None): cdef: @@ -242,11 +403,28 @@ cdef class FLIRecvH: with nogil: derr = dragon_fli_close_recv_handle(&self._recvh, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while closing receive handle.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not close receive handle stream") self._is_open = False + @property + def stream_received(self): + cdef: + dragonError_t derr + bool result + + derr = dragon_fli_stream_received(&self._recvh, &result) + + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to get the stream received property") + + return result + def __del__(self): try: self.close(timeout=DEFAULT_CLOSE_TIMEOUT) @@ -273,15 +451,20 @@ cdef class FLIRecvH: # To pass in as a pointer, get the address of the 0th index &c_data[0] with nogil: derr = dragon_fli_recv_bytes_into(&self._recvh, max_bytes, &num_bytes, &c_data[0], &arg, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while receiving bytes into.") + if derr == DRAGON_EOT: raise FLIEOT(derr, "End of Transmission") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not receive into bytes buffer") # Landing pad should be populated, just return arg return arg - def recv_bytes(self, timeout=None): + def recv_bytes(self, size=-1, timeout=None): cdef: dragonError_t derr size_t num_bytes @@ -296,11 +479,21 @@ cdef class FLIRecvH: time_ptr = _computed_timeout(timeout, &timer) + if size > 0: + max_bytes = size + # A max_bytes value of 0 means "get everything" with nogil: derr = dragon_fli_recv_bytes(&self._recvh, max_bytes, &num_bytes, &c_data, &arg, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while receiving bytes.") + if derr == DRAGON_EOT: + if num_bytes > 0: + free(c_data) raise FLIEOT(derr, "End of Transmission") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Error 
receiving FLI data") @@ -310,6 +503,7 @@ cdef class FLIRecvH: py_bytes = py_view.tobytes() # Release underlying malloc now that we have a copy free(c_data) + c_data = NULL # Return data and metadata as a tuple return (py_bytes, arg) @@ -328,8 +522,15 @@ cdef class FLIRecvH: with nogil: derr = dragon_fli_recv_mem(&self._recvh, &mem, &arg, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while receiving memory.") + if derr == DRAGON_EOT: + with nogil: + dragon_memory_free(&mem) raise FLIEOT(derr, "End of Transmission") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Error receiving FLI data into memory object") @@ -353,6 +554,10 @@ cdef class FLIRecvH: with nogil: derr = dragon_fli_create_readable_fd(&self._recvh, &fdes, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while creating readable file descriptor.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not open readable file descriptor") @@ -371,6 +576,10 @@ cdef class FLIRecvH: with nogil: derr = dragon_fli_finalize_readable_fd(&self._recvh) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while finalizing the readable file descriptor.") + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not finalize readable file descriptor") @@ -386,16 +595,17 @@ cdef class FLInterface: dragonFLIDescr_t _adapter dragonFLISerial_t _serial bool _is_serialized + bool _is_buffered list stream_channel_list MemoryPool pool def __getstate__(self): - return (self.serialize(),self.pool) + return (self.serialize(), self.pool) def __setstate__(self, state): serial_fli, pool = state - if not pool.is_local: + if pool is None or not pool.is_local: pool = None self._attach(serial_fli, pool) @@ -406,21 +616,28 @@ cdef class FLInterface: dragonFLISerial_t _serial dragonMemoryPoolDescr_t * mpool = NULL + if len(ser_bytes) == 0: + raise DragonFLIError(DragonError.INVALID_ARGUMENT, "The serialized bytes where empty.") + _serial.len = len(ser_bytes) cdef const unsigned char[:] cdata = ser_bytes _serial.data = &cdata[0] self._is_serialized = False + self.pool = pool if pool is not None: mpool = &pool._pool_hdl - self.pool = pool - else: - self.pool = None derr = dragon_fli_attach(&_serial, mpool, &self._adapter) + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Could not attach to FLI adapter") + derr = dragon_fli_is_buffered(&self._adapter, &self._is_buffered) + + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to get the is buffered property") + return self def __del__(self): @@ -437,14 +654,17 @@ cdef class FLInterface: dragonChannelDescr_t * c_mgr_ch = NULL dragonMemoryPoolDescr_t * c_pool = NULL Channel ch # Necessary to cast python objects into cython objects when pulling out stream_channel values + dragonULInt num_stream_channels self._is_serialized = False + self.pool = pool ### ### If creating main and manager channels, make sure their capacity is set to the number of stream channels ### num_stream_channels = len(stream_channels) + self._is_buffered = use_buffered_protocol if pool is None and main_ch is None: # Get default pool muid and create a main_channel from there @@ -452,10 +672,6 @@ cdef class FLInterface: ch_options = ChannelOptions(capacity=num_stream_channels) main_ch = dgchan.create(default_muid, options=ch_options) - if pool is None and main_ch is not None: - # Do nothing, C code handles this - pass - # Get pointers to the handles # This simplifies the actual C call since the pointers will 
either be NULL or assigned to the objects handle if main_ch is not None: @@ -466,9 +682,6 @@ cdef class FLInterface: if pool is not None: c_pool = &pool._pool_hdl - self.pool = pool - else: - self.pool = None if num_stream_channels > 0: strm_chs = malloc(sizeof(dragonChannelDescr_t*) * num_stream_channels) @@ -476,11 +689,13 @@ cdef class FLInterface: ch = stream_channels[i] strm_chs[i] = &ch._channel - derr = dragon_fli_create(&self._adapter, c_main_ch, c_mgr_ch, c_pool, - num_stream_channels, strm_chs, use_buffered_protocol, NULL) + with nogil: + derr = dragon_fli_create(&self._adapter, c_main_ch, c_mgr_ch, c_pool, + num_stream_channels, strm_chs, use_buffered_protocol, NULL) if strm_chs != NULL: free(strm_chs) # Free our Malloc before error checking to prevent memory leaks + strm_chs = NULL if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Failed to create new FLInterface") @@ -497,16 +712,38 @@ cdef class FLInterface: def destroy(self): cdef dragonError_t derr - derr = dragon_fli_destroy(&self._adapter) + with nogil: + derr = dragon_fli_destroy(&self._adapter) + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Failed to destroy FLInterface") + def num_available_streams(self, timeout=None): + cdef: + dragonError_t derr + uint64_t count + timespec_t timer + timespec_t* time_ptr + + time_ptr = _computed_timeout(timeout, &timer) + + with nogil: + derr = dragon_fli_get_available_streams(&self._adapter, &count, time_ptr) + + if derr == DRAGON_TIMEOUT: + raise DragonFLITimeoutError(derr, "Time out while getting the number of available streams.") + + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to get the available streams") + + return count def serialize(self): cdef dragonError_t derr if not self._is_serialized: derr = dragon_fli_serialize(&self._adapter, &self._serial) + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Failed to serialize FLInterface") @@ -525,6 +762,7 @@ cdef class FLInterface: cdef dragonError_t derr derr = dragon_fli_detach(&self._adapter) + if derr != DRAGON_SUCCESS: raise DragonFLIError(derr, "Failed to detach from FLI adapter") @@ -539,3 +777,8 @@ cdef class FLInterface: Return a new FLI Recv Handle object """ return FLIRecvH(self, *args, **kwargs) + + @property + def is_buffered(self): + return self._is_buffered + diff --git a/src/dragon/pydragon_managed_memory.pyx b/src/dragon/pydragon_managed_memory.pyx index 64fd497..ab8eeb9 100644 --- a/src/dragon/pydragon_managed_memory.pyx +++ b/src/dragon/pydragon_managed_memory.pyx @@ -1,5 +1,7 @@ from dragon.dtypes_inc cimport * import enum +import sys + ################################ # Begin Cython definitions @@ -123,6 +125,37 @@ cdef class MemoryAlloc: if self._is_serial == 1: dragon_memory_serial_free(&self._mem_ser) + def __hash__(self): + """ + Get a hash value for the underlying memory + + :return: an integer hash value. + """ + cdef: + dragonError_t derr + dragonULInt hash_val + + derr = dragon_memory_hash(&self._mem_descr, &hash_val) + if derr != DRAGON_SUCCESS: + raise DragonMemoryError(derr, "Could not get memory hash") + + return hash_val + + def __eq__(self, MemoryAlloc other): + cdef: + dragonError_t derr + bool result + + # not sure this is needed, but doesn't hurt. 
+ if (type(other) is not MemoryAlloc): + raise DragonMemoryError(DRAGON_INVALID_OPERATION, "Cannot compare MemoryAlloc with value of different type.") + + derr = dragon_memory_equal(&self._mem_descr, &other._mem_descr, &result) + if derr != DRAGON_SUCCESS: + raise DragonMemoryError(derr, "Could not compare memory allocations") + + return result + def get_memview(self): """ Get a memoryview of the underlying memory @@ -232,6 +265,7 @@ cdef class MemoryAlloc: cdef: dragonError_t derr + # @MCB TODO: Does this still make sense? if self._is_attach == 0: raise RuntimeError("cannot detach from memory not attached to") @@ -243,16 +277,88 @@ cdef class MemoryAlloc: self._is_attach = 0 def free(self): + """ + Free the managed memory allocation. + + :return: None + :raises: DragonMemoryError + """ cdef dragonError_t derr - derr = dragon_memory_free(&self._mem_descr) + with nogil: + derr = dragon_memory_free(&self._mem_descr) if derr != DRAGON_SUCCESS: raise DragonMemoryError(derr, "could not free allocation") + def copy(self, pool:MemoryPool, timeout=None): + """ + Copy an allocation into a pool + + :param pool: The pool in which to copy it. + :return: A new memory allocation + :raises: DragonMemoryError + """ + + cdef: + dragonError_t err + dragonMemoryDescr_t to_mem + dragonMemorySerial_t ser_mem + timespec_t timer + timespec_t* time_ptr + + if timeout is None: + time_ptr = NULL + elif isinstance(timeout, int) or isinstance(timeout, float): + if timeout < 0: + raise ValueError('Cannot provide timeout < 0') + # Anything > 0 means use that as seconds for timeout. + time_ptr = &timer + timer.tv_sec = int(timeout) + timer.tv_nsec = int((timeout - timer.tv_sec)*1000000000) + else: + raise ValueError('The timeout must be a float or int') + + err = dragon_memory_copy(&self._mem_descr, &to_mem, &pool._pool_hdl, time_ptr) + if err != DRAGON_SUCCESS: + raise DragonMemoryError(err, "Could not copy memory.") + + mem_alloc_obj = MemoryAlloc.cinit(to_mem) + return mem_alloc_obj + @property def size(self): + """ + Return the size of the allocation + + return: the size in bytes of the memory allocation + """ return self._mem_size + @property + def pool(self): + """ + Return the pool of this memory allocation. + + :return: pool where this resides. 
+ :raises: DragonMemoryError + """ + cdef: + dragonError_t err + dragonMemoryPoolDescr_t pool_descr + dragonMemoryPoolSerial_t ser_pool + dragonULInt muid + + err = dragon_memory_get_pool(&self._mem_descr, &pool_descr) + if err != DRAGON_SUCCESS: + raise DragonMemoryError(err, "Could not retrieve the pool for the memory allocation.") + + err = dragon_memory_pool_serialize(&ser_pool, &pool_descr) + if err != DRAGON_SUCCESS: + raise DragonPoolError(err, "Could not serialize pool") + + # Returns a python copy of the serializer + return MemoryPool.attach(ser_pool.data[:ser_pool.len]) + cdef class MemoryAllocations: """ @@ -262,7 +368,8 @@ cdef class MemoryAllocations: cdef dragonMemoryPoolAllocations_t allocs def __del__(self): - dragon_memory_pool_allocations_destroy(&self.allocs) + with nogil: + dragon_memory_pool_allocations_destroy(&self.allocs) # @MCB Note: Cython gets really mad if we try to pass in C structs to __cinit__, so this will # do for now @@ -318,10 +425,10 @@ cdef class MemoryPool: self._serialized = 0 def __getstate__(self): - return (self.serialize(),) + return (self.serialize(), self._muid) def __setstate__(self, state): - (serialized_bytes,) = state + (serialized_bytes, self._muid) = state self.attach(serialized_bytes, existing_memory_pool=self) def __del__(self): @@ -354,6 +461,8 @@ cdef class MemoryPool: if not isinstance(uid, int): raise TypeError(f"Pool uid must be int, got type {type(uid)}") + self._muid = uid + derr = dragon_memory_attr_init(&self._mattr) if derr != DRAGON_SUCCESS: raise RuntimeError(f"MemoryAttr Error: Unable to initialized memory attribute. Dragon Error Code: ({dragon_get_rc_string(derr)})") @@ -418,6 +527,7 @@ cdef class MemoryPool: dragonMemoryPoolSerial_t _ser const unsigned char[:] cdata = pool_ser MemoryPool mpool + dragonULInt the_muid if existing_memory_pool is None: mpool = cls.__new__(cls) # Create an empty instance of MemoryPool @@ -436,6 +546,12 @@ cdef class MemoryPool: if derr != DRAGON_SUCCESS: raise DragonPoolAttachFail(derr, "Could not attach to serialized pool") + derr = dragon_memory_pool_muid(&mpool._pool_hdl, &the_muid) + if derr != DRAGON_SUCCESS: + raise DragonPoolError(derr, "Could not retrieve muid of pool.") + + mpool._muid = the_muid + return mpool def destroy(self): @@ -444,7 +560,9 @@ cdef class MemoryPool: """ cdef dragonError_t derr - derr = dragon_memory_pool_destroy(&self._pool_hdl) + with nogil: + derr = dragon_memory_pool_destroy(&self._pool_hdl) + if derr != DRAGON_SUCCESS: raise DragonPoolError(derr, "Could not destroy pool") @@ -496,6 +614,7 @@ cdef class MemoryPool: cdef: dragonError_t derr dragonMemoryDescr_t mem + size_t sz if not isinstance(size, int): raise TypeError(f"Allocation size must be int, got type {type(size)}") @@ -504,7 +623,10 @@ cdef class MemoryPool: if size < 1: raise RuntimeError("Size cannot be less than 1 for memory allocations") - derr = dragon_memory_alloc(&mem, &self._pool_hdl, size) + sz = size + + with nogil: + derr = dragon_memory_alloc(&mem, &self._pool_hdl, sz) if derr == DRAGON_DYNHEAP_REQUESTED_SIZE_NOT_AVAILABLE: raise DragonPoolAllocationNotAvailable(derr, f"An allocation of size={size} is not available.") @@ -587,7 +709,8 @@ cdef class MemoryPool: dragonError_t derr dragonMemoryPoolAllocations_t allocs - derr = dragon_memory_pool_get_allocations(&self._pool_hdl, &allocs) + with nogil: + derr = dragon_memory_pool_get_allocations(&self._pool_hdl, &allocs) if derr != DRAGON_SUCCESS: raise DragonPoolError(derr, "Could not retrieve allocation list from pool") @@ -604,9 +727,15 @@ 
cdef class MemoryPool: cdef: dragonError_t derr int flag + dragonMemoryAllocationType_t tyval + dragonULInt alid + + tyval = alloc_type.value + alid = alloc_id - derr = dragon_memory_pool_allocation_exists(&self._pool_hdl, alloc_type.value, - alloc_id, &flag) + with nogil: + derr = dragon_memory_pool_allocation_exists(&self._pool_hdl, tyval, + alid, &flag) if derr != DRAGON_SUCCESS: raise DragonPoolError(derr, "Error checking allocation existence") @@ -625,9 +754,15 @@ cdef class MemoryPool: dragonError_t derr dragonMemoryDescr_t mem_descr size_t mem_size + dragonMemoryAllocationType_t tyval + dragonULInt alid + + tyval = alloc_type.value + alid = alloc_id - derr = dragon_memory_get_alloc_memdescr(&mem_descr, &self._pool_hdl, - alloc_type.value, alloc_id, 0, NULL) + with nogil: + derr = dragon_memory_get_alloc_memdescr(&mem_descr, &self._pool_hdl, + tyval, alid, 0, NULL) if derr != DRAGON_SUCCESS: raise DragonPoolError(derr, "Could not retrieve memory descriptor of provided ID and Type") @@ -646,3 +781,58 @@ cdef class MemoryPool: def is_local(self): return dragon_memory_pool_is_local(&self._pool_hdl) + @property + def muid(self): + return self._muid + + @property + def free_space(self): + cdef: + dragonError_t derr + uint64_t sz + + with nogil: + derr = dragon_memory_pool_get_free_size(&self._pool_hdl, &sz) + if derr != DRAGON_SUCCESS: + raise DragonPoolError(derr, "Could not retrieve the pool's free space.") + + return sz + + @property + def size(self): + cdef: + dragonError_t derr + uint64_t sz + + with nogil: + derr = dragon_memory_pool_get_total_size(&self._pool_hdl, &sz) + if derr != DRAGON_SUCCESS: + raise DragonPoolError(derr, "Could not retrieve the pool's total size.") + + return sz + + @property + def utilization(self): + cdef: + dragonError_t derr + double pct + + with nogil: + derr = dragon_memory_pool_get_utilization_pct(&self._pool_hdl, &pct) + if derr != DRAGON_SUCCESS: + raise DragonPoolError(derr, "Could not retrieve the pool's percent utilized space.") + + return pct + + @property + def rt_uid(self): + cdef: + dragonError_t derr + dragonULInt rtuid + + with nogil: + derr = dragon_memory_pool_get_rt_uid(&self._pool_hdl, &rtuid) + if derr != DRAGON_SUCCESS: + raise DragonPoolError(derr, "Error getting pool's rt_uid") + + return rtuid diff --git a/src/dragon/pydragon_utils.pyx b/src/dragon/pydragon_utils.pyx index e426555..be962d3 100644 --- a/src/dragon/pydragon_utils.pyx +++ b/src/dragon/pydragon_utils.pyx @@ -1,5 +1,6 @@ from dragon.dtypes_inc cimport * from dragon.return_codes cimport * +#import cython cpdef host_id(): return dragon_host_id() @@ -8,6 +9,9 @@ cpdef set_host_id(int new_id): dnew_id = new_id dragon_set_host_id(new_id) +cpdef get_local_rt_uid(): + return dragon_get_local_rt_uid() + cpdef set_procname(str name): estr = name.encode('utf-8') cdef dragonError_t err = dragon_set_procname(estr) @@ -18,48 +22,45 @@ cdef class B64: """ Cython wrapper for Dragon's byte <> string conversion routines. """ cdef: char* _encoded_string - size_t _length def __cinit__(self): self._encoded_string = NULL - self._length = 0 def __del__(self): - if self._length > 0: + if self._encoded_string != NULL: free(self._encoded_string) self._encoded_string = NULL - self._length = 0 def __init__(self, data): """Convert a bytes array into a base64 encoded string. - :param data: The list of bytes to convert. :return: A new B64String object containing the base64 encoded string. 
""" - self._encoded_string = dragon_base64_encode(data, len(data), &self._length) + self._encoded_string = dragon_base64_encode(data, len(data)) def __str__(self): - return self._encoded_string[:self._length].decode('utf-8') + return self._encoded_string[:strlen(self._encoded_string)].decode('utf-8') def _initialize_from_str(self, serialized_str): - self._length = 0 - cdef char * todataptr = malloc(len(serialized_str)) + cdef char * todataptr = malloc(len(serialized_str)+1) if todataptr == NULL: raise ValueError('Could not allocate space for B64 object.') data = serialized_str.encode('utf-8') - size = len(serialized_str) + size = len(serialized_str)+1 cdef char * fromdataptr = data memcpy(todataptr, fromdataptr, size) self._encoded_string = todataptr - self._length = size def decode(self): cdef: uint8_t *data size_t decoded_length - data = dragon_base64_decode(self._encoded_string, self._length, &decoded_length) + data = dragon_base64_decode(self._encoded_string, &decoded_length) + if data == NULL: + raise ValueError('Invalid Base64 Value') + val = data[:decoded_length] free(data) return val @@ -73,10 +74,8 @@ cdef class B64: @classmethod def bytes_to_str(cls, the_bytes): """Converts bytes into a string by base64 encoding it. - Convenience function to convert bytes objects to base64 encoded strings. - :param the_bytes: bytes to get encoded :return: string """ @@ -85,10 +84,100 @@ cdef class B64: @classmethod def str_to_bytes(cls, the_str): """Converts a base64 encoded string to a bytes object. - Convenience function to unpack strings. - :param the_str: base64 encoded string. :return: original bytes representation. """ - return cls.from_str(the_str).decode() + data = cls.from_str(the_str).decode() + if data is None: + raise ValueError('Could not convert Base64 string to bytes.') + + return data + +cpdef b64encode(the_bytes): + return B64.bytes_to_str(the_bytes) + +cpdef b64decode(the_str): + return B64.str_to_bytes(the_str) + +cpdef hash(byte_str:bytes): + cdef: + const unsigned char[:] buf = byte_str + + return dragon_hash(&buf[0], len(byte_str)) + +cpdef set_local_kv(key, value, timeout=None): + cdef: + const unsigned char* val_ptr + unsigned char* empty_str = "" + const unsigned char[:] key_str + const unsigned char[:] val_str + + dragonError_t err + timespec_t * time_ptr + timespec_t val_timeout + + if len(key) == 0: + raise KeyError('Key cannot be empty') + + if timeout is None: + time_ptr = NULL + elif isinstance(timeout, int) or isinstance(timeout, float): + if timeout < 0: + raise ValueError('Cannot provide timeout < 0 to set_local_kv operation') + # Anything > 0 means use that as seconds for timeout. + time_ptr = & val_timeout + val_timeout.tv_sec = int(timeout) + val_timeout.tv_nsec = int((timeout - val_timeout.tv_sec)*1000000000) + else: + raise ValueError('make_process_local timeout must be a float or int') + + key_str = str.encode(key) + + if len(value) > 0: + val_str = str.encode(value) + val_ptr = &val_str[0] + else: + val_ptr = empty_str + + with nogil: + err = dragon_ls_set_kv(&key_str[0], val_ptr, time_ptr) + + if err != DRAGON_SUCCESS: + raise RuntimeError(f'Could not set kv pair. 
EC=({dragon_get_rc_string(err)})\n ERR_MSG={dragon_getlasterrstr()}') + +cpdef get_local_kv(key, timeout=None): + cdef: + const unsigned char[:] key_str + char* val_str + + dragonError_t err + timespec_t * time_ptr + timespec_t val_timeout + + if timeout is None: + time_ptr = NULL + elif isinstance(timeout, int) or isinstance(timeout, float): + if timeout < 0: + raise ValueError('Cannot provide timeout < 0 to get_local_kv operation') + # Anything > 0 means use that as seconds for timeout. + time_ptr = & val_timeout + val_timeout.tv_sec = int(timeout) + val_timeout.tv_nsec = int((timeout - val_timeout.tv_sec)*1000000000) + else: + raise ValueError('get_local_kv timeout must be a float or int') + + key_str = str.encode(key) + + with nogil: + err = dragon_ls_get_kv(&key_str[0], &val_str, time_ptr) + + if err == DRAGON_NOT_FOUND: + raise KeyError(key) + + if err != DRAGON_SUCCESS: + raise RuntimeError(f'Could not get kv pair. EC=({dragon_get_rc_string(err)})\n ERR_MSG={dragon_getlasterrstr()}') + + return val_str.decode('utf-8') + + diff --git a/src/dragon/transport/oob/__init__.py b/src/dragon/transport/oob/__init__.py new file mode 100644 index 0000000..95b1545 --- /dev/null +++ b/src/dragon/transport/oob/__init__.py @@ -0,0 +1,127 @@ +import os +import subprocess +from ... import channels as dch +from ... import managed_memory as dmm +from ...globalservices.channel import create, release_refcnt +from ...localservices.options import ChannelOptions as ShepherdChannelOptions +from ...infrastructure.channel_desc import ChannelOptions +from ...infrastructure import connection as dconn +from ...infrastructure import facts as dfacts +from ...infrastructure import messages as dmsg +from ...infrastructure import parameters as dparms +from ...infrastructure import util as dutil +from ...launcher import util as dlutil +from ...utils import B64 + + +class OutOfBand: + + def __init__(self, log_sdesc=None): + self.connecting_ta = False + self.accepting_ta = True + self.ta_started = False + self.ta_input = None + self.tunnel_proc = None + self.port = None + self.channels = [] + self.log_sdesc = log_sdesc + + + # TODO: need to guarantee that this is channel local + def new_local_channel(self, capacity=128): + sh_channel_options = ShepherdChannelOptions(capacity) + gs_channel_options = ChannelOptions(ref_count=True, local_opts=sh_channel_options) + + m_uid = dfacts.default_pool_muid_from_index(dparms.this_process.index) + descriptor = create(m_uid, options=gs_channel_options) + ch = dch.Channel.attach(descriptor.sdesc) + self.channels.append(ch) + + return ch + + + def setup_gateways(self, env, fe_ext_ip_addr, head_node_ip_addr): + # create a gateway channel and associate it with the remote runtime + gw_ch = self.new_local_channel() + gw_str = B64.bytes_to_str(gw_ch.serialize()) + remote_rt_uid = dutil.rt_uid_from_ip_addrs(fe_ext_ip_addr, head_node_ip_addr) + + env['DRAGON_REMOTE_RT_UID'] = str(remote_rt_uid) + env[f'DRAGON_RT_UID__{remote_rt_uid}'] = gw_str + os.environ[f'DRAGON_RT_UID__{remote_rt_uid}'] = gw_str + + + def start_oob_transport(self, fe_ext_ip_addr=None, head_node_ip_addr=None, port=None): + # save port + self.port = port + + # create tcp agents input and output channels + output_ch = self.new_local_channel() + input_ch = self.new_local_channel() + self.ta_input = dconn.Connection(outbound_initializer=input_ch, policy=dparms.POLICY_INFRASTRUCTURE) + + env = dict(os.environ) + + args = [ + 'python3', '-m', 'dragon.cli', 'dragon-tcp', + f'--oob-port={port}', + '--no-tls', 
'--no-tls-verify', + f'--ch-in-sdesc={B64.bytes_to_str(input_ch.serialize())}', + f'--ch-out-sdesc={B64.bytes_to_str(output_ch.serialize())}' + ] + + if self.log_sdesc != None: + args.append(f'--log-sdesc={self.log_sdesc}') + args.append(f'--dragon-logging=true') + args.append(f'--log-level=DEBUG') + else: + args.append(f'--no-dragon-logging') + + # this should only be true on the connecting side + if head_node_ip_addr != None: + self.setup_gateways(env, fe_ext_ip_addr, head_node_ip_addr) + oob_ip_addr = head_node_ip_addr + args.append(f'--oob-ip-addr={oob_ip_addr}') + + # this sets an arbitrary value for node_index + args.append(str(0)) + + subprocess.Popen(args, env=env) + + + def connect(self, fe_ext_ip_addr, head_node_ip_addr, port): + self.connecting_ta = True + if not self.ta_started: + tunnel_args = ['ssh', '-J', f'{fe_ext_ip_addr}', '-N', '-L', f'{port}:localhost:{port}', '-f', f'{head_node_ip_addr}', '>', '/dev/null', '2>&1'] + self.tunnel_proc = subprocess.Popen(tunnel_args) + self.start_oob_transport(fe_ext_ip_addr=fe_ext_ip_addr, head_node_ip_addr=head_node_ip_addr, port=port) + self.ta_started = True + + + def accept(self, port): + self.accepting_ta = True + if not self.ta_started: + self.start_oob_transport(port=port) + self.ta_started = True + + + def __del__(self): + # kill the (local) ssh tunnel + if self.tunnel_proc is not None: + self.tunnel_proc.terminate() + + # halt the OOB transport agents + if self.ta_started: + halt_msg = dmsg.UserHaltOOB(tag=dlutil.next_tag()) + self.ta_input.send(halt_msg.serialize()) + + # clean up input/output channels used for OOB transport agents + for ch in self.channels: + release_refcnt(ch.cuid) + ch.detach() + + # kill process that accepts incoming tcp connections + #TODO: Improve this. We should know which process is being killed + # and kill it explicitly. 
+ if self.accepting_ta: + os.system(f'kill $(lsof -t -i:{self.port}) > /dev/null 2>&1') diff --git a/src/dragon/transport/tcp/__main__.py b/src/dragon/transport/tcp/__main__.py index b77a335..d6678ef 100644 --- a/src/dragon/transport/tcp/__main__.py +++ b/src/dragon/transport/tcp/__main__.py @@ -1,10 +1,12 @@ import asyncio import logging import os +import socket import ssl +from collections import defaultdict from typing import Union -from ...channels import Channel +from ...channels import Channel, register_gateways_from_env from ...infrastructure import messages as dmsg from ...infrastructure.connection import Connection, ConnectionOptions from ...infrastructure.facts import GW_ENV_PREFIX, DEFAULT_TRANSPORT_PORT, FRONTEND_HOSTID @@ -30,6 +32,11 @@ LOGGER = logging.getLogger('dragon.transport.tcp.__main__') +user_initiated = False +infrastructure = False +out_of_band_connect = False +out_of_band_accept = False + def single_recv(ch_sdesc: bytes) -> Union[tuple(dmsg.all_message_classes)]: ch = Channel.attach(ch_sdesc) @@ -78,9 +85,16 @@ async def tcp_transport_agent(node_index: str = None, max_threads: int = None, host_ids: list[str] = None, ip_addrs: list[str] = None, - infrastructure: bool = False, + oob_ip_addr: str = None, + oob_port: str = None, frontend: bool = False) -> None: + # TODO: get rid of globals from proxy-api work (when possible) + global user_initiated + global infrastructure + global out_of_band_connect + global out_of_band_accept + from ...dlogging.util import DragonLoggingServices if infrastructure: LOGGER = logging.getLogger(DragonLoggingServices.ON).getChild('transport.tcp.__main__') @@ -90,11 +104,13 @@ async def tcp_transport_agent(node_index: str = None, else: up_msg = dmsg.OverlayPingBE(next_tag()) halt_msg = dmsg.BEHaltOverlay - - else: + elif user_initiated: LOGGER = logging.getLogger(DragonLoggingServices.TA).getChild('transport.tcp.__main__') up_msg = dmsg.TAPingSH(next_tag()) halt_msg = dmsg.SHHaltTA + else: + LOGGER = logging.getLogger(DragonLoggingServices.OOB).getChild('transport.tcp.__main__') + halt_msg = dmsg.UserHaltOOB if max_threads is not None: # Set a new default executor that limits the maximum number of worker @@ -108,7 +124,7 @@ async def tcp_transport_agent(node_index: str = None, loop.set_default_executor(executor) # Initial receive from input channel to get LAChannelsInfo message - if not infrastructure: + if user_initiated: la_channels_info = await asyncio.to_thread(single_recv, ch_in_sdesc.decode()) assert isinstance(la_channels_info, dmsg.LAChannelsInfo), "Did not receive LAChannelsInfo from local services" @@ -136,10 +152,18 @@ async def tcp_transport_agent(node_index: str = None, except Exception: LOGGER.critical(f'Failed to build node-address mapping from LAChannelsInfo.nodes_desc: {la_channels_info.nodes_desc}') raise - - # If infrastructure, create same data structures as above, but use information input - # since there's no LAChannelsInfo for us + elif out_of_band_connect: + # If this agent is connecting to another dragon instance to allow out-of-band + # communication, then all host_ids should map to the same target address + addr = Address.from_netloc(f'{oob_ip_addr}:{oob_port}') + nodes = defaultdict(lambda: addr) + elif out_of_band_accept: + # If this agent is accepting connections from remote dragon instances to allow + # out-of-band communication, then it can have an empty nodes dict + nodes = {} else: + # If infrastructure, create same data structures as above, but use information input + # since there's no LAChannelsInfo for 
us nodes = {} for ip_addr, host_id in zip(ip_addrs, host_ids): try: @@ -154,24 +178,35 @@ async def tcp_transport_agent(node_index: str = None, try: # Establish connection for command-and-control - if not infrastructure: + if user_initiated: ch_out_sdesc = B64.from_str(node_desc.shep_cd) control = await asyncio.to_thread(open_connection, ch_in_sdesc.decode(), ch_out_sdesc.decode()) except Exception: - if not infrastructure: + if user_initiated: LOGGER.critical(f'Failed to initialize the control connection: Requires valid input and output channel descriptors for node index {node_index}: input_channel={ch_in_sdesc}, output_channel={ch_out_sdesc}, LAChannelsInfo={la_channels_info.get_sdict()}') - else: + elif infrastructure: LOGGER.critical(f'Failed to initialize the control connection: Requires valid input and output channel descriptors: input_channel={ch_in_sdesc}, output_channel={ch_out_sdesc}') raise try: # Create transport - if not infrastructure: + if user_initiated: transport = StreamTransport(nodes[int(node_desc.host_id)]) wait_mode = DEFAULT_WAIT_MODE - else: + elif infrastructure: transport = StreamTransport(nodes[int(get_host_id())]) wait_mode = IDLE_WAIT + else: + hostname = socket.gethostname() + ip_addr = socket.gethostbyname(hostname) + local_addr = Address.from_netloc(f'{ip_addr}:{oob_port}') + transport = StreamTransport(local_addr) + if out_of_band_connect: + transport._oob_connect = True + else: + transport._oob_accept = True + wait_mode = IDLE_WAIT + LOGGER.info(f'Created transport: {transport.addr} with wait mode {wait_mode}') if tls_enabled: @@ -196,11 +231,19 @@ async def tcp_transport_agent(node_index: str = None, async with Agent(transport, nodes, wait_mode=wait_mode) as agent: LOGGER.info('Created agent') + n_gw = 0 + # Create clients for each gateway channel - if not infrastructure: + if user_initiated: n_gw = la_channels_info.num_gw_channels - else: + elif infrastructure: n_gw = DRAGON_OVERLAY_DEFAULT_NUM_GW_CHANNELS_PER_NODE + elif out_of_band_connect: + rt_uid = os.environ['DRAGON_REMOTE_RT_UID'] + encoded_channel_sdesc = os.environ[f'DRAGON_RT_UID__{rt_uid}'] + channel_sdesc = B64.from_str(encoded_channel_sdesc).decode() + agent.new_client(channel_sdesc) + LOGGER.debug(f'Created client for OOB gateway channel') for i in range(n_gw): encoded_channel_sdesc = os.environ[GW_ENV_PREFIX + str(i+1)] @@ -210,8 +253,9 @@ async def tcp_transport_agent(node_index: str = None, # Send TAPingSH to local services to acknowledge transport is active # XXX What is tag? 
- await asyncio.to_thread(control.send, up_msg.serialize()) - LOGGER.info(f'Sent {type(up_msg)} reply') + if user_initiated or infrastructure: + await asyncio.to_thread(control.send, up_msg.serialize()) + LOGGER.info(f'Sent {type(up_msg)} reply') while agent.is_running(): LOGGER.debug('Agent is running, polling control') @@ -252,7 +296,7 @@ def main(args=None): from distutils.util import strtobool from ...dlogging.util import DragonLoggingServices, setup_BE_logging - from ...infrastructure.facts import PROCNAME_OVERLAY_TA, PROCNAME_TCP_TA + from ...infrastructure.facts import PROCNAME_OVERLAY_TA, PROCNAME_TCP_TA, PROCNAME_OOB_TA from ...infrastructure.util import range_expr from ...utils import set_procname @@ -262,6 +306,8 @@ def main(args=None): help="Set to use for runtime infrastructure rather than backend communiction") parser.add_argument('--ip-addrs', metavar='FRONTEND_IP', dest='ip_addrs', nargs='+', type=str, help=FRONTEND_HELP) + parser.add_argument('--oob-ip-addr', type=str, help="Target IP address for out-of-band communication") + parser.add_argument('--oob-port', type=str, help="Listening port at target for out-of-band communication") parser.add_argument('--ch-in-sdesc', type=B64.from_str, help="Base64 encoded serialized input channel descriptor") @@ -344,19 +390,43 @@ def get_log_level(level): args = parser.parse_args() + global user_initiated + global infrastructure + global out_of_band_connect + global out_of_band_accept + + if args.oob_ip_addr != None: + out_of_band_connect = True + elif args.oob_port != None: + out_of_band_accept = True + elif args.infrastructure: + infrastructure = True + else: + user_initiated = True + if args.frontend: set_host_id(FRONTEND_HOSTID) - if not args.infrastructure: + if user_initiated: set_procname(PROCNAME_TCP_TA) - else: + elif infrastructure: set_procname(PROCNAME_OVERLAY_TA) + else: + set_procname(PROCNAME_OOB_TA) + + # In the OOB accept case, the sendmsg/getmsg/poll used for the local + # channel operation can (and frequently will) target an off-node channel + if out_of_band_accept: + register_gateways_from_env() if args.dragon_logging and args.log_sdesc is not None: - if not args.infrastructure: + if user_initiated: service = DragonLoggingServices.TA - else: + elif infrastructure: service = DragonLoggingServices.ON + else: + service = DragonLoggingServices.OOB + log_level, _ = setup_BE_logging(service=service, logger_sdesc=args.log_sdesc) # We will ignore the argument level because we want a unified setting @@ -391,7 +461,8 @@ def get_log_level(level): args.max_threads, args.host_ids, args.ip_addrs, - args.infrastructure, + args.oob_ip_addr, + args.oob_port, args.frontend )) except Exception: @@ -399,13 +470,14 @@ def get_log_level(level): raise # Send exit control message - if args.infrastructure: - down_msg = dmsg.OverlayHalted(tag=next_tag()) - else: + if user_initiated: down_msg = dmsg.TAHalted(tag=next_tag()) + elif infrastructure: + down_msg = dmsg.OverlayHalted(tag=next_tag()) try: - control.send(down_msg.serialize()) + if user_initiated or infrastructure: + control.send(down_msg.serialize()) except Exception: LOGGER.exception('Failed to send TAHalted') raise diff --git a/src/dragon/transport/tcp/client.py b/src/dragon/transport/tcp/client.py index 31d9230..b9ca4e9 100644 --- a/src/dragon/transport/tcp/client.py +++ b/src/dragon/transport/tcp/client.py @@ -195,7 +195,7 @@ async def _(self, resp: RecvResponse, addr: Address, msg: GatewayMessage) -> Non """Handle RecvResponse messages.""" assert msg.is_get_kind try: - msg_recv = 
await asyncio.to_thread(create_msg, resp.payload, self._channel, msg.deadline, msg.get_dest_mem_descr_ser) + msg_recv = await asyncio.to_thread(create_msg, resp.payload, resp.clientid, resp.hints, self._channel, msg.deadline, msg.get_dest_mem_descr_ser) except BaseException as e: try: msg.complete_error(get_errno(e)) @@ -224,12 +224,14 @@ def create_request(msg: GatewayMessage) -> Request: else: raise ValueError('Unsupported send return mode') sendhid = UUIDBytesIO.decode(msg.send_payload_message_attr_sendhid) + clientid = msg.send_payload_message_attr_clientid + hints = msg.send_payload_message_attr_hints payload = msg.send_payload_message mem_sd = msg.send_dest_mem_descr_ser if mem_sd is None: - cls = partial(SendRequest, return_mode=send_return_mode, sendhid=sendhid, payload=payload) + cls = partial(SendRequest, return_mode=send_return_mode, sendhid=sendhid, clientid=clientid, hints=hints, payload=payload) else: - cls = partial(SendMemoryRequest, return_mode=send_return_mode, sendhid=sendhid, payload=payload, mem_sd=mem_sd) + cls = partial(SendMemoryRequest, return_mode=send_return_mode, sendhid=sendhid, clientid=clientid, hints=hints, payload=payload, mem_sd=mem_sd) elif msg.is_get_kind: cls = RecvRequest elif msg.is_event_kind: diff --git a/src/dragon/transport/tcp/messages.py b/src/dragon/transport/tcp/messages.py index d05f13d..6417512 100644 --- a/src/dragon/transport/tcp/messages.py +++ b/src/dragon/transport/tcp/messages.py @@ -261,6 +261,16 @@ class SendRequest(Request, typeid=b'\x01'): concurrency is limited based on target channel and the send handle ID. """ + clientid: uint64 + """The ``clientid`` attribute is a user supplied + attribute of a message that does not affect the payload. + """ + + hints: uint64 + """The ``hints`` attribute is a user supplied attribute + of a message that does not affect the payload. + """ + payload: varbytes """Message payload.""" @@ -324,6 +334,16 @@ class SendResponse(Response, typeid=b'\xfe'): class RecvResponse(Response, typeid=b'\xfc'): """Response to a `RecvRequest`.""" + clientid: uint64 + """The ``clientid`` attribute is a user supplied + attribute of a message that does not affect the payload. + """ + + hints: uint64 + """The ``hints`` attribute is a user supplied attribute + of a message that does not affect the payload. + """ + payload: varbytes """Message payload.""" diff --git a/src/dragon/transport/tcp/server.py b/src/dragon/transport/tcp/server.py index 3a84560..b8aa15c 100644 --- a/src/dragon/transport/tcp/server.py +++ b/src/dragon/transport/tcp/server.py @@ -115,6 +115,8 @@ async def _(self, req: SendRequest, addr: Address) -> None: await asyncio.to_thread( send_msg, req.channel_sd, + req.clientid, + req.hints, req.payload, req.deadline, getattr(req, 'mem_sd', None), @@ -127,33 +129,15 @@ async def _(self, req: SendRequest, addr: Address) -> None: @handle_request.register async def _(self, req: RecvRequest, addr: Address) -> None: - msg = await asyncio.to_thread(recv_msg, req.channel_sd, req.deadline, wait_mode=self._wait_mode) + clientid, hints, msg_bytes = await asyncio.to_thread(recv_msg, req.channel_sd, req.deadline, wait_mode=self._wait_mode) task = None # Ensure task is defined for use in exception handler - destroy_callback = lambda t: msg.destroy() try: - # Create the response. Note the use of a memory view into the - # message payload! This means we need to take special care to not - # destroy the message until AFTER the response has been sent. 
- resp = RecvResponse(req.seqno, msg.bytes_memview()) - # Create a background task to destroy the message after the - # response is sent. - task = asyncio.create_task(resp._io_event.wait()) - self._background_tasks.add(task) - task.add_done_callback(self._background_tasks.discard) - # Add a callback to destroy the received message after it has been - # sent. - task.add_done_callback(destroy_callback) + # Create the response. Note the use of a bytearray for the + # response payload. + resp = RecvResponse(req.seqno, clientid, hints, msg_bytes) # Write the response self.transport.write_response(resp, addr) except: - # Remove the destroy_callback since we're going to immediately - # destroy it. - try: - if isinstance(task, asyncio.Task): - task.remove_done_callback(destroy_callback) - except: - pass - msg.destroy() raise @handle_request.register diff --git a/src/dragon/transport/tcp/transport.py b/src/dragon/transport/tcp/transport.py index 136e4f9..05aaad2 100644 --- a/src/dragon/transport/tcp/transport.py +++ b/src/dragon/transport/tcp/transport.py @@ -375,6 +375,8 @@ def __init__(self, addr: Address): self._recv_tasks = defaultdict(WeakSet) self._mailboxes = defaultdict(asyncio.Queue) self._send_tasks = WeakValueDictionary() + self._oob_connect = False + self._oob_accept = False @run_forever async def run(self): @@ -382,7 +384,10 @@ async def run(self): Typically ran in a separate `asyncio.Task` via `StreamTransport.start`. """ - self._server = await asyncio.start_server(self.accept_connection, str(self.addr.host), int(self.addr.port), **self.server_options) + if self._oob_accept: + self._server = await asyncio.start_server(self.accept_connection, 'localhost', int(self.addr.port), **self.server_options) + else: + self._server = await asyncio.start_server(self.accept_connection, str(self.addr.host), int(self.addr.port), **self.server_options) try: async with self._server: await self._server.serve_forever() @@ -467,8 +472,13 @@ async def _open_connection(self, addr: Address, /) -> tuple[asyncio.StreamReader :return: `asyncio.StreamReader` and `asyncio.StreamWriter` pair """ opts = self.default_connection_options.new_child(self.connection_options[addr]) - reader, writer = await asyncio.open_connection(str(addr.host), int(addr.port), **opts) - addr = await self.addr.do_handshake(reader, writer) + if self._oob_connect or self._oob_accept: + reader, writer = await asyncio.open_connection('localhost', int(addr.port), **opts) + await self.addr.do_handshake(reader, writer) + else: + reader, writer = await asyncio.open_connection(str(addr.host), int(addr.port), **opts) + addr = await self.addr.do_handshake(reader, writer) + # XXX See comment above in accept_connection() on why this is not a # XXX tenable workaround for actual server authentication. ## Verify connection is to the advertised address diff --git a/src/dragon/transport/tcp/util.py b/src/dragon/transport/tcp/util.py index db622ba..4e196d4 100644 --- a/src/dragon/transport/tcp/util.py +++ b/src/dragon/transport/tcp/util.py @@ -9,7 +9,6 @@ from ...managed_memory import MemoryAlloc, MemoryPool from ...dtypes import WaitMode, DEFAULT_WAIT_MODE - def unget_nowait(self: Queue, item: Any) -> None: """Re-queues an item at the front of a queue, essentially performing the opposite of get_nowait(). 
@@ -53,7 +52,7 @@ def seconds_remaining(deadline: float, _inf: Optional[float] = None) -> (Optiona return remaining, since -def mem_descr_msg(sdesc: bytes, data: bytes) -> Message: +def mem_descr_msg(sdesc: bytes, data: bytes, clientid: int, hints: int) -> Message: """Attaches to memory allocation given a serialized memory descriptor and writes the specified data. Yields a Dragon message created from the corresponding memory allocation. @@ -66,13 +65,13 @@ def mem_descr_msg(sdesc: bytes, data: bytes) -> Message: raise ValueError(f'Memory allocation too small: {mem}') v = mem.get_memview() v[:] = data - return Message.create_from_mem(mem) + return Message.create_from_mem(mem, clientid, hints) except: mem.detach() raise -def mem_pool_msg(pool: MemoryPool, data: bytes, deadline: Optional[float] = None) -> Message: +def mem_pool_msg(pool: MemoryPool, data: bytes, clientid: int, hints: int, deadline: Optional[float] = None) -> Message: """Creates Dragon message from data in specific pool's memory allocation. Deadline is measured relative to time.monotonic(). @@ -80,7 +79,7 @@ def mem_pool_msg(pool: MemoryPool, data: bytes, deadline: Optional[float] = None # XXX ChannelSendH.send_bytes() does not call Message.create_alloc() with # XXX a timeout or deadline; hence it is re-implemented here. timeout, _ = seconds_remaining(deadline) - msg = Message.create_alloc(pool, len(data), timeout=timeout) + msg = Message.create_alloc(pool, len(data), clientid=clientid, hints=hints, timeout=timeout) try: v = msg.bytes_memview() v[:] = data @@ -90,7 +89,7 @@ def mem_pool_msg(pool: MemoryPool, data: bytes, deadline: Optional[float] = None return msg -def create_msg(data: bytes, channel: Optional[Channel] = None, deadline: Optional[float] = None, sdesc: Optional[bytes] = None) -> Message: +def create_msg(data: bytes, clientid: int, hints: int, channel: Optional[Channel] = None, deadline: Optional[float] = None, sdesc: Optional[bytes] = None) -> Message: """Creates a Dragon message from data. If a channel is specified, the message will be created from a memory @@ -103,9 +102,9 @@ def create_msg(data: bytes, channel: Optional[Channel] = None, deadline: Optiona """ if sdesc is None: assert channel is not None - return mem_pool_msg(channel.get_pool(), data, deadline) + return mem_pool_msg(channel.default_alloc_pool, data, clientid, hints, deadline) else: - return mem_descr_msg(sdesc, data) + return mem_descr_msg(sdesc, data, clientid, hints) @contextmanager @@ -127,6 +126,8 @@ def open_handle(h: Union[ChannelRecvH, ChannelSendH]) -> Generator[Union[Channel def send_msg(channel_sdesc: bytes, + clientid: int, + hints: int, payload: bytes, deadline: float, mem_sd: Optional[bytes] = None, @@ -134,7 +135,7 @@ def send_msg(channel_sdesc: bytes, wait_mode: WaitMode = DEFAULT_WAIT_MODE) -> None: # TODO Should we cache attached channels and open handles? with attach_channel(channel_sdesc) as ch, open_handle(ch.sendh(wait_mode=wait_mode)) as h: - msg = create_msg(payload, ch, deadline, mem_sd) + msg = create_msg(payload, clientid, hints, ch, deadline, mem_sd) timeout, _ = seconds_remaining(deadline) h.send( @@ -150,7 +151,12 @@ def recv_msg(channel_sdesc: bytes, # TODO Should we cache attached channels and open handles? 
with attach_channel(channel_sdesc) as ch, open_handle(ch.recvh(wait_mode=wait_mode)) as h: timeout, _ = seconds_remaining(deadline) - return h.recv(timeout=timeout) + msg = h.recv(timeout=timeout) + clientid = msg.clientid + hints = msg.hints + py_view = msg.tobytes() + msg.destroy() + return clientid, hints, py_view def poll_channel(channel_sdesc: bytes, event_mask: int, deadline: float) -> (bool, int): diff --git a/src/dragon/workflows/parsl_mpi_app.py b/src/dragon/workflows/parsl_mpi_app.py index b1baf7b..213d769 100644 --- a/src/dragon/workflows/parsl_mpi_app.py +++ b/src/dragon/workflows/parsl_mpi_app.py @@ -1,6 +1,6 @@ import dragon -from dragon.native.process import Process, TemplateProcess, Popen +from dragon.native.process import Process, ProcessTemplate, Popen from dragon.native.process_group import ProcessGroup import logging @@ -71,7 +71,7 @@ def submit(self, func, resource_specification, *args): # Pipe stdin and stdout from the head process to Dragon connections grp.add_process( nproc=1, - template=TemplateProcess( + template=ProcessTemplate( target=target_exe, args=mpi_args, cwd=run_dir, stdout=Popen.PIPE, stdin=Popen.PIPE ), ) @@ -79,7 +79,7 @@ def submit(self, func, resource_specification, *args): # All other ranks should have their output go to DEVNULL grp.add_process( nproc=num_ranks - 1, - template=TemplateProcess(target=target_exe, args=mpi_args, cwd=run_dir, stdout=Popen.DEVNULL), + template=ProcessTemplate(target=target_exe, args=mpi_args, cwd=run_dir, stdout=Popen.DEVNULL), ) grp.init() grp.start() diff --git a/src/dragon/workflows/runtime.py b/src/dragon/workflows/runtime.py new file mode 100644 index 0000000..4d4c84b --- /dev/null +++ b/src/dragon/workflows/runtime.py @@ -0,0 +1,261 @@ +import atexit +import json +import os +import pathlib +import socket +import subprocess +import time + +import dragon.infrastructure.messages as dmsg +import dragon.infrastructure.facts as dfacts +import dragon.infrastructure.util as dutil +import dragon.globalservices.api_setup as api_setup +import dragon.transport.oob as doob +import dragon.utils as dutils + + +current_rt_uid = None +already_published = {} +sdesc_by_system_and_name = {} +runtime_table = {} +oob_net_list = [] +must_register_teardown = True + + +def proxy_teardown(): + + global already_published + for publish_path in already_published: + os.remove(publish_path) + + runtime_table.clear() + oob_net_list.clear() + + +def free_port(host, port): + + try: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((host, int(port))) + s.shutdown(socket.SHUT_WR) + s.close() + return False + except ConnectionRefusedError: + return True + + +def get_port(host, start_port, end_port): + + for port in range(start_port, end_port+1): + if free_port(host, port): + return port + + raise RuntimeError('No available ports') + + +def get_ip(): + + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.settimeout(0) + try: + # doesn't even have to be reachable + s.connect(('10.254.254.254', 1)) + IP = s.getsockname()[0] + except Exception: + IP = '127.0.0.1' + finally: + s.close() + return IP + + +def get_current_inf_env(): + + rv = {} + gs_cd_key = dfacts.env_name('GS_CD') + gs_ret_cd_key = dfacts.env_name('GS_RET_CD') + ls_cd_key = dfacts.env_name('LOCAL_SHEP_CD') + ls_ret_cd_key = dfacts.env_name('SHEP_RET_CD') + rv[gs_cd_key] = os.environ[gs_cd_key] + rv[gs_ret_cd_key] = os.environ[gs_ret_cd_key] + rv[ls_cd_key] = os.environ[ls_cd_key] + rv[ls_ret_cd_key] = os.environ[ls_ret_cd_key] + # get runtime ip addrs + rt_uid_key = 
'DRAGON_RT_UID' + rv[rt_uid_key] = os.environ[rt_uid_key] + return rv + +def set_inf_env(env): + + gs_cd_key = dfacts.env_name('GS_CD') + gs_ret_cd_key = dfacts.env_name('GS_RET_CD') + ls_cd_key = dfacts.env_name('LOCAL_SHEP_CD') + ls_ret_cd_key = dfacts.env_name('SHEP_RET_CD') + os.environ[gs_cd_key] = env[gs_cd_key] + os.environ[gs_ret_cd_key] = env[gs_ret_cd_key] + os.environ[ls_cd_key] = env[ls_cd_key] + os.environ[ls_ret_cd_key] = env[ls_ret_cd_key] + # set runtime ip addrs + rt_uid_key = 'DRAGON_RT_UID' + os.environ[rt_uid_key] = env[rt_uid_key] + + global current_rt_uid + current_rt_uid = env[rt_uid_key] + + api_setup.connect_to_infrastructure(force=True) + + +@property +def current_rt_uid(): + + global current_rt_uid + if current_rt_uid is None: + return dutils.get_local_rt_uid() + else: + return current_rt_uid + + +class Proxy: + + def __init__(self, sdesc, oob_net): + self._sdesc = sdesc + self._env = json.loads(sdesc.env) + self._original_env = None + self._oob_net = oob_net + + def __enter__(self): + # Reconfigure infrastructure connnections + set_inf_env(self._env) + + def __exit__(self, exc_type, exc_value, traceback): + # Restore the original infrastructure connections + set_inf_env(self._original_env) + + def enable(self): + self._original_env = get_current_inf_env() + set_inf_env(self._env) + + def disable(self): + set_inf_env(self._original_env) + + def get_env(self): + return self._env + + +def get_sdesc(): + + gs_cd = os.environ[dfacts.env_name(dfacts.GS_CD)] + gs_ret_cd = os.environ[dfacts.env_name(dfacts.GS_RET_CD)] + ls_cd = os.environ[dfacts.env_name(dfacts.LOCAL_SHEP_CD)] + ls_ret_cd = os.environ[dfacts.env_name(dfacts.SHEP_RET_CD)] + + fe_ext_ip_addr = os.environ['DRAGON_FE_EXTERNAL_IP_ADDR'] + head_node_ip_addr = os.environ['DRAGON_HEAD_NODE_IP_ADDR'] + oob_port = dfacts.OOB_PORT + sdesc = dmsg.RuntimeDesc(0, gs_cd, gs_ret_cd, ls_cd, ls_ret_cd, fe_ext_ip_addr, head_node_ip_addr, oob_port, os.environ) + return sdesc.serialize() + + +def publish(name): + + sdesc_str = get_sdesc() + + global already_published + if name in already_published: + return get_sdesc() + + home_dir = pathlib.Path.home() + dragon_dir = home_dir / '.dragon' + publish_path = f'{dragon_dir}/{name}' + pathlib.Path(dragon_dir).mkdir(parents=True, exist_ok=True) + + # if the file already exists, just read from it and return the sdesc + # (depending on the filesystem, the file can potentially be visible + # anywhere on the system, so this is a better, but more costly, check + # than using "already_published" above) + + if os.path.exists(publish_path): + with open(publish_path, 'r') as publish_file: + return publish_file.read() + + try: + with open(publish_path, 'w') as publish_file: + publish_file.write(sdesc_str) + + already_published[publish_path] = True + + global oob_net_list + oob_net = doob.OutOfBand() + oob_net.accept(dfacts.OOB_PORT) + oob_net_list.append(oob_net) + + global must_register_teardown + if must_register_teardown: + atexit.register(proxy_teardown) + must_register_teardown = False + except: + raise RuntimeError(f'publish failed: {name} has already been published') + + return sdesc_str + + +def lookup(system, name, timeout_in=None): + + global sdesc_by_system_and_name + + # if we've already looked up this system/name combination, just grab + # the sdesc from our local dict + if (system, name) in sdesc_by_system_and_name: + return sdesc_by_system_and_name[(system, name)] + + # loop until timeout trying to scp the sdesc file from the remote system, + # sleeping 1 seconds between 
each attempt + time_so_far = 0 + if timeout_in is None: + timeout = 1 + else: + timeout = timeout_in + + home_dir = pathlib.Path.home() + dragon_dir = home_dir / '.dragon' + publish_path = f'{dragon_dir}/{name}' + + while time_so_far < timeout: + rc = os.system(f'scp {system}:{publish_path} . > /dev/null 2>&1') + if rc == 0: + with open(name, 'r') as publish_file: + sdesc = publish_file.read() + sdesc_by_system_and_name[(system, name)] = sdesc + os.remove(name) + return sdesc + else: + # do ls on remote .dragon directory to force visibility of + # newly created files on the parallel file system + os.system(f'ssh {system} "ls {publish_path}" > /dev/null 2>&1') + time.sleep(1) + time_so_far += 1 + + raise RuntimeError(f'lookup failed: could not obtain serialized descriptor for {system=} and {name=}') + + +def attach(sdesc_str): + + sdesc = dmsg.parse(sdesc_str) + + jump_host = sdesc.fe_ext_ip_addr + compute_node = sdesc.head_node_ip_addr + tunnel_port = sdesc.oob_port + + oob_net = doob.OutOfBand() + oob_net.connect(jump_host, compute_node, str(tunnel_port)) + + global must_register_teardown + + if must_register_teardown: + atexit.register(proxy_teardown) + must_register_teardown = False + + proxy = Proxy(sdesc, oob_net) + remote_rt_uid = dutil.rt_uid_from_ip_addrs(sdesc.fe_ext_ip_addr, sdesc.head_node_ip_addr) + runtime_table[remote_rt_uid] = proxy + + return proxy diff --git a/src/include/Makefile b/src/include/Makefile index d79d08d..ea8f4be 100644 --- a/src/include/Makefile +++ b/src/include/Makefile @@ -13,6 +13,8 @@ HEADERS = dragon/bcast.h \ dragon/return_codes_map.h \ dragon/shared_lock.h \ dragon/utils.h \ + dragon/ddict.h \ + dragon/dictionary.hpp \ dragon/fli.h DISTFILES = $(addprefix $(INSTALL_DIR)/include/,$(HEADERS)) diff --git a/src/include/dragon/channels.h b/src/include/dragon/channels.h index 3314310..56395d5 100644 --- a/src/include/dragon/channels.h +++ b/src/include/dragon/channels.h @@ -228,7 +228,8 @@ typedef struct dragonChannelRecvAttr_st { * */ typedef struct dragonChannelDescr_st { - uint64_t _idx; + dragonRT_UID_t _rt_idx; + dragonC_UID_t _idx; } dragonChannelDescr_t; /** @@ -620,6 +621,9 @@ dragon_channel_gatewaymessage_transport_event_cmplt(dragonGatewayMessage_t* gmsg dragonError_t dragon_channel_gatewaymessage_client_event_cmplt(dragonGatewayMessage_t* gmsg, dragonULInt* event, const dragonWaitMode_t wait_mode); +dragonError_t +dragon_create_process_local_channel(dragonChannelDescr_t* ch, const timespec_t* timeout); + #ifdef __cplusplus } #endif diff --git a/src/include/dragon/ddict.h b/src/include/dragon/ddict.h new file mode 100644 index 0000000..022a435 --- /dev/null +++ b/src/include/dragon/ddict.h @@ -0,0 +1,431 @@ +/* + Copyright 2020, 2022 Hewlett Packard Enterprise Development LP +*/ +#ifndef HAVE_DRAGON_DDICT_H +#define HAVE_DRAGON_DDICT_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup ddict_structs API Structures + * + * The ddict API structures. + * @{ + */ + +/** + * @brief An opaque DDict descriptor object + * + * When a using a distributed dictionary from C, this serves + * as the handle to the dictionary. Attaching to a distributed + * dictionary intializes a dragonDDictDescr_t. +*/ +typedef struct dragonDDictDescr_st { + uint64_t _idx; +} dragonDDictDescr_t; + + +/** + * @brief An opaque handle to a request object + * + * This is used when doing any interaction with the distributed dictionary. 
+ * Operations on the dictionary may involve multiple call, such as a put or a + * get operation, and this request descriptor helps maintain the state of the + * request and response to this request. +*/ +typedef struct dragonDDictRequestDescr_st { + uint64_t _idx; +} dragonDDictRequestDescr_t; + + +/** + * @brief A serialized DDict object + * + * A serialized DDict object can be passed to other processes which can then + * attach to the object. Attaching initializes a dragonDDictDescr_t handle to + * the distributed dictionary. +*/ +typedef struct dragonDDictSerial_st { + size_t len; /*!< The length of the serialized descriptor in bytes. */ + uint8_t * data; /* !< The serialized descriptor data to be shared. */ +} dragonDDictSerial_t; + + /** @} */ // end of ddict_structs group. + + +/** @defgroup ddict_lifecycle + * DDict Life Cycle Functions + * @{ + */ +/** + * @brief Serialize a DDict object for sharing with another process. When sharing + * an DDict object with another process you may use this function to create a + * shareable serialized descriptor. This creates a binary string which may not + * be ASCII compliant. Before sharing, if ASCII compliance is required, call a + * base64 encoder like the dragon_base64_encode found in dragon/utils.h before + * sharing and dragon_base64_decode before attaching from the other process. + * + * NOTE: You must call dragon_ddict_serial_free to free a serialized descriptor + * after calling this function to free the extra space allocated by this + * function once you are done with the serialized descriptor. + * + * @param obj is a valid DDict descriptor that has previously been created or attached. + * + * @param serial is a serialized descriptor that will be initialized with the correct + * byte count and serialized bytes for so it can be passed to another process. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. +**/ +dragonError_t +dragon_ddict_serialize(const dragonDDictDescr_t* obj, dragonDDictSerial_t* serial); + + +/** + * @brief Free the internal resources of a serialized DDict descriptor + * + * This frees internal structures of a serialized DDict descriptor. It does not + * destroy the DDict object itself. + * + * @param serial is a serialized DDict descriptor. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_ddict_serial_free(dragonDDictSerial_t* serial); + +/** + * @brief Attach to an DDict object using a base 64 encoded string + * + * Calling this attaches to a DDict object by using a base 64 encoded serialized + * DDict descriptor that was passed to this process. The serialized DDict + * descriptor must have been created by base 64 encoding a serialized DDict + * descriptor. + * + * @param b64_str is a pointer to the serialized base 64 encoded string. + * + * @param obj is a pointer to an DDict descriptor that will be initialized by + * this call. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the operation + * to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + * + * NOTES: This does the base64 decode and then calls the normal attach function. 
+ */ +dragonError_t +dragon_ddict_attach_b64(char* b64_str, dragonDDictDescr_t* obj, const timespec_t* timeout); + +/** + * @brief Attach to an DDict object + * + * Calling this attaches to a DDict object by using a serialized DDict descriptor + * that was passed to this process. The serialized DDict descriptor must have + * been created using the dragon_ddict_serialize function. + * + * @param serial is a pointer to the serialized DDict descriptor. + * + * @param obj is a pointer to an DDict descriptor that will be initialized by + * this call. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the operation + * to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_ddict_attach(const dragonDDictSerial_t* serial, dragonDDictDescr_t* obj, const timespec_t* timeout); + +/** + * @brief Detach from a DDict object. + * + * All internal, process local resources are freed by making this call. + * + * @param obj is a descriptor and opaque handle to the DDict object. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the operation + * to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_ddict_detach(dragonDDictDescr_t* obj, const timespec_t* timeout); + +/** + * @brief Destroy a DDict object. + * + * The distributed dictionary is destroyed including the orchestrator, all + * managers, and their associated flis and channels. + * + * @param obj is a descriptor and opaque handle to the DDict object. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the + * operation to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_ddict_destroy(dragonDDictDescr_t* obj, const timespec_t* timeout); + +/** @} */ // end of ddict_lifecycle group. + +/** @defgroup ddict_requests + * DDict Request Functions + * @{ + */ + +/** + * @brief Create a request descriptor for sending a request to the distributed + * dictionary and waiting for a response. + * + * All internal state of the connection to the distributed dictionary is + * maintained by this request object. Details of particular operations may be + * stored in the private data structure for this object but are not accessible + * directly by the user. The user uses associated API calls that use this + * object. Not every request requires a request object. Requests that are + * accomplished by one API call are not dependent on a request object. When a + * request object is required it will be evident in the API. + * + * @param obj is a pointer to an initialized distributed dicationary descriptor. + * + * @param req is a pointer to a request object that will be initialized by this call. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_ddict_create_request(dragonDDictDescr_t* obj, dragonDDictRequestDescr_t* req); + +/** + * @brief This finalizes a request by completing any operation that was still + * pending for this request. When a request object is required it will be + * indicated in the API. + * + * @param req is a valid request object. + * + * @param timeout is a pointer to a timeout structure. 
If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the + * operation to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + **/ + dragonError_t + dragon_ddict_finalize_request(dragonDDictRequestDescr_t* req, const timespec_t* timeout); + +/** @} */ // end of ddict_requests group. + +/** @defgroup ddict_sendrecv + * DDict Send/Receive Functions + * @{ + */ + +/** + * @brief Use this to write either key or value data to the distributed dictionary. + * + * The client may call this multiple times to put the parts of a key or value to + * the distributed dictionary. Internally, all key writes are buffered so the + * key can then be used to determine where the data is to be placed in the + * distributed dictionary. All value writes are streamed immediately to the + * distributed dictionary. All Key writes must come first for a request, + * followed by value writes. Key writes are terminated by an API call to the + * actual operation that requires a key as part of its request. Value writes, + * for a put, follow the API call for the operation until the request is + * finalized. All clients use the same selection algorithm for data placement + * so data put by one client can be found by all other clients. + * + * @param req is an initialized request object. + * + * @param num_bytes is the number of bytes on this put request. There may be + * additional bytes sent using this call as well. + * + * @param bytes is a pointer to a byte array (continguous bytes) with num_bytes size. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the + * operation to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + **/ +dragonError_t +dragon_ddict_write_bytes(dragonDDictRequestDescr_t* req, size_t num_bytes, + uint8_t* bytes, const timespec_t* timeout); + +/** + * @brief Calling this waits to receive streamed data from a distributed + * dictionary manager. If all data has been read, then DRAGON_EOT will be + * returned as the return code. This should be called after a get operation + * has been performed by calling dragon_ddict_get. Note that before calling + * the get operation, the key should have been written using the + * dragon_ddict_write_bytes operation. + * + * @param req is a valid request object that has been used to initiate reading + * data from the distributed dictionary. For example, a key should have been + * written and dragon_ddict_get should have been called. + * + * @param requested_size is the number of requested bytes. The actual size will + * be equal to or less than the requested_size. + * + * @param received_size is a pointer to the number of bytes that have been read on + * the call (assuming DRAGON_SUCCESS was returned). + * + * @param bytes is a pointer pointer that will be initialized to the bytes that + * were read. The space is malloc'ed and should be freed by the user once the + * data has been processed. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the + * operation to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. 
+ **/ + dragonError_t + dragon_ddict_read_bytes(dragonDDictRequestDescr_t* req, size_t requested_size, + size_t* received_size, uint8_t** bytes, const timespec_t* timeout); + +/** + * @brief Receive streamed data. + * + * Calling this waits to receive streamed data from a distributed + * dictionary manager. If all data has been read, then DRAGON_EOT will be + * returned as the return code. This should be called after a get operation + * has been performed by calling dragon_ddict_get. Note that before calling + * the get operation, the key should have been written using the + * dragon_ddict_write_bytes operation. + * + * @param req is a valid request object that has been used to initiate reading + * data from the distributed dictionary. For example, a key should have been + * written and dragon_ddict_get should have been called. + * + * @param requested_size is the number of requested bytes. The actual size will + * be equal to or less than the requested_size. + * + * @param received_size is a pointer to the number of bytes that have been read on + * the call (assuming DRAGON_SUCCESS was returned). + * + * @param bytes is a pointer to valid space where the data should be placed. It must + * be at least requested_size in size. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the + * operation to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + **/ + dragonError_t + dragon_ddict_read_bytes_into(dragonDDictRequestDescr_t* req, size_t requested_size, + size_t* received_size, uint8_t* bytes, const timespec_t* timeout); + +/** + * @brief Receive streamed data. + * + * Calling this waits to receive streamed data from a distributed + * dictionary manager but instead of copying it into malloced memory, returns + * the underlying managed memory object to the user. If all data has been + * read, then DRAGON_EOT will be returned as the return code. This should be + * called after a get operation has been performed by calling + * dragon_ddict_get. Note that before calling the get operation, the key + * should have been written using the dragon_ddict_write_bytes operation. + * + * @param req is a valid request object that has been used to initiate reading + * data from the distributed dictionary. For example, a key should have been + * written and dragon_ddict_get should have been called. + * + * @param mem is a managed memory allocation containing the packet of streamed + * data. The size of the memory allocation is available as part of the object + * and the managed memory API provides a means to get a pointer to the data. + * The managed memory allocation should be freed using the managed memory API + * once it is no longer needed. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + **/ + dragonError_t + dragon_ddict_read_mem(dragonDDictRequestDescr_t* req, dragonMemoryDescr_t* mem); + + +/** @} */ // end of ddict_sendrecv group. + +/** @defgroup ddict_ops + * Distributed Dictionary Operations + * @{ + */ + +/** + * @brief Check to see if a key exists in the ddict + * + * @param req is a valid request object. The key to check must already have been + * written to the request via the dragon_ddict_write_bytes call(s). + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + **/ + dragonError_t + dragon_ddict_contains(dragonDDictRequestDescr_t* req); + +/** + * @brief Calling this tells the ddict client to take the key already written via + * the dragon_ddict_write_bytes call(s) to be posted to the correct manager + * and wait for a response. + * + * @param req is a valid created request object. It must have already had a key + * written to it via the dragon_ddict_write_bytes call. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the + * operation to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + **/ + dragonError_t + dragon_ddict_get(dragonDDictRequestDescr_t* req, const timespec_t* timeout); + +/** + * @brief Calling this tells the ddict client to take the key already written via + * the dragon_ddict_write_bytes call(s) to be posted to the correct manager. + * The key must be written before calling put. All writes to this request, + * following the call to this function are written to the correct manager as + * the value for the put on the distributed dictionary manager. + * + * @param req is a valid created request object. It must have already had a key + * written to it via the dragon_ddict_write_bytes call. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait + * indefinitely. Otherwise, wait for the specified amount of time for the + * operation to complete. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + **/ + dragonError_t + dragon_ddict_put(dragonDDictRequestDescr_t* req, const timespec_t* timeout); + + /** Other Dictionary operations will follow. **/ + +/** @} */ // end of ddict_ops group. 
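+/*
+ * Illustrative usage sketch (documentation only, not part of the API): the put
+ * protocol described above is to create a request, write the key bytes, call
+ * dragon_ddict_put to route the buffered key to its manager, stream the value
+ * bytes, and finalize the request. The helper name example_put, the NULL
+ * timeouts, and the use of strlen are assumptions made for this sketch; error
+ * handling is abbreviated.
+ *
+ *     #include <string.h>
+ *
+ *     dragonError_t example_put(dragonDDictDescr_t* dict, char* key, char* value)
+ *     {
+ *         dragonDDictRequestDescr_t req;
+ *         dragonError_t err;
+ *
+ *         err = dragon_ddict_create_request(dict, &req);
+ *         if (err != DRAGON_SUCCESS)
+ *             return err;
+ *
+ *         // Key bytes are written first and are buffered by the client.
+ *         err = dragon_ddict_write_bytes(&req, strlen(key), (uint8_t*)key, NULL);
+ *         if (err != DRAGON_SUCCESS)
+ *             return err;
+ *
+ *         // The put call terminates the key writes and selects the manager.
+ *         err = dragon_ddict_put(&req, NULL);
+ *         if (err != DRAGON_SUCCESS)
+ *             return err;
+ *
+ *         // Value writes follow the operation call until the request is finalized.
+ *         err = dragon_ddict_write_bytes(&req, strlen(value), (uint8_t*)value, NULL);
+ *         if (err != DRAGON_SUCCESS)
+ *             return err;
+ *
+ *         return dragon_ddict_finalize_request(&req, NULL);
+ *     }
+ *
+ * A get follows the same pattern: write the key, call dragon_ddict_get, then call
+ * dragon_ddict_read_bytes until DRAGON_EOT is returned, and finalize the request.
+ */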
+ + + #ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/dragon/dictionary.hpp b/src/include/dragon/dictionary.hpp new file mode 100644 index 0000000..9004128 --- /dev/null +++ b/src/include/dragon/dictionary.hpp @@ -0,0 +1,118 @@ +#ifndef DRAGON_DICTIONARY_HPP +#define DRAGON_DICTIONARY_HPP + +#include +#include +#include +#include +#include +#include + +class DDictSerializable { + public: + virtual void serialize(dragonDDictRequestDescr_t* req, const timespec_t* timeout) = 0; + virtual void deserialize(dragonDDictRequestDescr_t* req, const timespec_t* timeout) = 0; +}; + +template <typename DDictSerializableKey, typename DDictSerializableValue> +class DDict { + class KVPair + { + public: + KVPair(DDict& dict, DDictSerializableKey& key) : dict(dict), key(key) {} + + void operator= (DDictSerializableValue& value) { + dragonDDictRequestDescr_t req; + dragonError_t err; + + err = dragon_ddict_create_request(&this->dict.c_dict, &req); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + + key.serialize(&req, this->dict.timeout); + + err = dragon_ddict_put(&req, this->dict.timeout); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + + value.serialize(&req, this->dict.timeout); + + err = dragon_ddict_finalize_request(&req, this->dict.timeout); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + } + + operator DDictSerializableValue () const { + dragonDDictRequestDescr_t req; + dragonError_t err; + DDictSerializableValue value; + + err = dragon_ddict_create_request(&this->dict.c_dict, &req); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + + key.serialize(&req, this->dict.timeout); + + err = dragon_ddict_get(&req, this->dict.timeout); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + + value.deserialize(&req, this->dict.timeout); + + err = dragon_ddict_finalize_request(&req, this->dict.timeout); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + + return value; + } + + private: + DDict& dict; + DDictSerializableKey& key; + }; + + public: + DDict(const char* serialized_dict, const timespec_t* timeout) { + dragonError_t err; + dragonDDictSerial_t ser; + this->timeout = NULL; + + if (serialized_dict == NULL) { + std::string estr = "Cannot pass NULL serialized_dict to DDict attach."; + throw DragonError(DRAGON_INVALID_ARGUMENT, estr.c_str()); + } + + this->serialized = serialized_dict; + + if (timeout != NULL) { + timeout_val = *timeout; + this->timeout = &this->timeout_val; + } + + ser.data = dragon_base64_decode(serialized_dict, &ser.len); + + err = dragon_ddict_attach(&ser, &this->c_dict, this->timeout); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + + err = dragon_ddict_serial_free(&ser); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); + } + + const char* serialize() { + return this->serialized.c_str(); + } + + KVPair operator[] (DDictSerializableKey& key) { + return DDict::KVPair(*this, key); + } + + private: + dragonDDictDescr_t c_dict; + std::string serialized; + timespec_t* timeout; /* to be used on all timeout operations. */ + timespec_t timeout_val; + +}; + +#endif \ No newline at end of file diff --git a/src/include/dragon/fli.h b/src/include/dragon/fli.h index c466223..823bf4f 100644 --- a/src/include/dragon/fli.h +++ b/src/include/dragon/fli.h @@ -13,14 +13,31 @@ extern "C" { #endif -/** @defgroup fli_structs API Structures + * - * The fli API structures. 
+ * The fli API constants. * @{ */ + /** + * @brief Constant to be used when opening a send and a receive handle as the + * stream channel argument when wishing to use the main channel as a stream + * channel. Both sender and receiver must have been designed to use this + * protocol since no run-time negotiation of this is provided. + * + */ + static dragonChannelDescr_t* const STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION = (dragonChannelDescr_t*)0x0000000000001111; + +/** @} */ // end of fli_consts group. + +/** @defgroup fli_structs + * + * The FLI API structures. + * @{ + */ + /** * @brief The attributes structure of an fli adapter. * @@ -90,6 +107,10 @@ typedef struct dragonFLIRecvHandleDescr_st { /** @} */ // end of fli_structs group. +/** @defgroup fli_lifecycle FLI Lifecycle Functions + * @{ + */ + /** * @brief Initialize attributes for a FLI adapter. * @@ -107,7 +128,7 @@ dragon_fli_attr_init(dragonFLIAttr_t* attr); /** * @brief Create an FLI adapter. * - * Create an FLI adapter. An FLI adapter guarantees that a send a receive handle + * An FLI adapter guarantees that a send and receive handle * is between one sender and one receiver and will not have to deal with data * interleaving from other processes. In addition, data may be streamed between * the sender and receiver when the FLI adapter is not used in buffered mode. @@ -201,10 +222,10 @@ dragon_fli_attr_init(dragonFLIAttr_t* attr); * of the adapter create above. The application is responsible for the clean up of these * channels at the end of their life. * -* @param use_buffered_protocol if true then only a main channel should provided, no +* @param use_buffered_protocol if true then only a main channel should be provided and no * manager channel or stream channels are required. In this case all sent data is - * buffered into one message for each file write operation (all sends on an open send - * handle). The receiving side receives one message per completed file write operation. + * buffered into one message for all file write operations (all writes on an open send + * handle). The receiving side receives one message per conversation. * * @param attr is a pointer to the attributes structure that was previously * inited. If the attr arg is NULL the default attributes will be used. @@ -232,7 +253,9 @@ dragonError_t dragon_fli_destroy(dragonFLIDescr_t* adapter); /** - * @brief Serialize a FLI adapter for sharing with another process When sharing + * @brief Serialize a FLI adapter + * + * This enables sharing with another process. When sharing * an FLI adapter with another process you may use this function to create a * shareable serialized descriptor. This creates a binary string which may not * be ASCII compliant. Before sharing, if ASCII compliance is required, call a * @@ -304,6 +327,50 @@ dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t dragonError_t dragon_fli_detach(dragonFLIDescr_t* adapter); +/** + * @brief Get available stream channels from adapter. + * + * Get the number of stream channels currently available in the manager + * FLI. This provides a count of the number of channels currently held + * in reserve in the FLI. + * + * @param adapter is a descriptor and opaque handle to the FLI adapter. + * + * @param num_streams is a pointer to an integer result when DRAGON_SUCCESS is + * returned. Otherwise the value will be 0. + * + * @param timeout is the amount of time to wait. A NULL timeout means to wait + * indefinitely.
+ * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_fli_get_available_streams(dragonFLIDescr_t* adapter, uint64_t* num_streams, + const timespec_t* timeout); + +/** + * @brief Query if this FLI is a buffered FLI. + * + * Sets the is_buffered flag accordingly if the FLI is buffered or not. + * + * @param adapter is a descriptor and opaque handle to the FLI adapter. + * + * @param is_buffered is a pointer to a bool result. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ + +dragonError_t +dragon_fli_is_buffered(const dragonFLIDescr_t* adapter, bool* is_buffered); + + +/** @} */ // end of fli_lifecycle group. + +/** @defgroup fli_handles + * FLI Send/Receive Handle Management + * @{ + */ + /** * @brief Open a Send Handle * @@ -415,6 +482,20 @@ dragonError_t dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const timespec_t* timeout); +/** + * @brief Check that a Stream is completely received + * + * Check a receive handle to see if a stream has been completely received. + * + * @param recv_handle is the open receive handle to be queried. + * + * @param stream_received is set upon successful completion. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. +**/ +dragonError_t +dragon_fli_stream_received(dragonFLIRecvHandleDescr_t* recv_handle, bool* stream_received); + /** * @brief Create a file descriptor to send bytes over an FLI adapter. * @@ -525,6 +606,12 @@ dragon_fli_create_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle, int* fd_p dragonError_t dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle); +/** @} */ // end of fli_lifecycle group. + +/** @defgroup fli_sendrecv + * FLI Send/Receive Functions + * @{ + */ /** * @brief Send bytes through the FLI adapter. @@ -534,7 +621,7 @@ dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle); * * @param send_handle is an open send handle. * - * @param num_bytes is the number of bytes to be sent. + * @param num_bytes is the number of bytes to be sent and must be greater than zero. * * @param bytes is a pointer to the data to be sent. * @@ -542,7 +629,8 @@ dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle); * received by the receiving side. It does not affect the message itself. When using * the buffered protocol, only the first write into an open send handle will allow * this arg to be passed along. All other values of this arg on subsequent writes - * to an open send handle are ignored. + * to an open send handle are ignored. The value of 0xFFFFFFFFFFFFFFFF is used + * internally and is not allowed. * * @param buffer is a constant of either false (or 0 or NULL), which means use * the default behavior, or true in which case it buffers the data until @@ -574,7 +662,12 @@ dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, * received by the receiving side. It does not affect the message itself. When using * the buffered protocol, only the first write into an open send handle will allow * this arg to be passed along. All other values of this arg on subsequent writes - * to an open send handle are ignored. + * to an open send handle are ignored. The value of 0xFFFFFFFFFFFFFFFF is used + * internally and is not allowed. + * + * @param transfer_ownership is true if ownership of the managed memory should + * be transferred to the receiver. Passing false means the ownership remains + * with the sender. 
This also implies a copy is made on sending. * * @param timeout is a pointer to a timeout structure. If NULL, then wait forever * with no timeout. If not NULL, then wait for the specified amount of time and @@ -583,10 +676,10 @@ dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, * * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. **/ + dragonError_t dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t* mem, - uint64_t arg, const timespec_t* timeout); - + uint64_t arg, bool transfer_ownership, const timespec_t* timeout); /** * @brief Receive data from the FLI adapter. * @@ -691,6 +784,8 @@ dragonError_t dragon_fli_recv_mem(dragonFLIRecvHandleDescr_t* recv_handle, dragonMemoryDescr_t* mem, uint64_t* arg, const timespec_t* timeout); +/** @} */ // end of fli_sendrecv group. + #ifdef __cplusplus } #endif diff --git a/src/include/dragon/global_types.h b/src/include/dragon/global_types.h index 2afcf67..10f785b 100644 --- a/src/include/dragon/global_types.h +++ b/src/include/dragon/global_types.h @@ -11,14 +11,18 @@ extern "C" { #endif +#define DRAGON_UUID_NELEM 16 +#define DRAGON_UUID_SIZE (DRAGON_UUID_NELEM * sizeof(uint8_t)) + /* standard types for use across the Dragon stack */ +typedef uint64_t dragonRT_UID_t; typedef uint64_t dragonC_UID_t; typedef uint64_t dragonP_UID_t; typedef uint64_t dragonM_UID_t; typedef uint64_t dragonQ_UID_t; typedef uint64_t dragonULInt; typedef uint32_t dragonUInt; -typedef uint8_t dragonUUID[16]; +typedef uint8_t dragonUUID[DRAGON_UUID_NELEM]; typedef struct timespec timespec_t; typedef struct timeval timeval_t; diff --git a/src/include/dragon/managed_memory.h b/src/include/dragon/managed_memory.h index 055122e..e0fcc2a 100644 --- a/src/include/dragon/managed_memory.h +++ b/src/include/dragon/managed_memory.h @@ -166,7 +166,8 @@ typedef struct dragonMemoryPoolAttr_st { */ typedef struct dragonMemoryPoolDescr_st { int _original; - dragonULInt _idx; + dragonM_UID_t _idx; + dragonRT_UID_t _rt_idx; } dragonMemoryPoolDescr_t; /** @@ -257,6 +258,12 @@ dragon_memory_pool_detach(dragonMemoryPoolDescr_t * pool_descr); dragonError_t dragon_memory_pool_get_hostid(dragonMemoryPoolDescr_t * pool_descr, dragonULInt * hostid); +dragonError_t +dragon_memory_pool_runtime_is_local(dragonMemoryPoolDescr_t *pool_descr, bool *runtime_is_local); + +dragonError_t +dragon_memory_pool_get_rt_uid(dragonMemoryPoolDescr_t *pool_descr, dragonULInt *rt_uid); + dragonError_t dragon_memory_pool_get_uid_fname(const dragonMemoryPoolSerial_t * pool_ser, dragonULInt * uid_out, char ** fname_out); @@ -288,7 +295,17 @@ dragonError_t dragon_memory_pool_get_type_allocations(const dragonMemoryPoolDescr_t * pool_descr, const dragonMemoryAllocationType_t type, dragonMemoryPoolAllocations_t * allocs); +dragonError_t +dragon_memory_pool_muid(dragonMemoryPoolDescr_t* pool_descr, dragonULInt* muid); + +dragonError_t +dragon_memory_pool_get_free_size(dragonMemoryPoolDescr_t* pool_descr, uint64_t* free_size); +dragonError_t +dragon_memory_pool_get_total_size(dragonMemoryPoolDescr_t* pool_descr, uint64_t* total_size); + +dragonError_t +dragon_memory_pool_get_utilization_pct(dragonMemoryPoolDescr_t* pool_descr, double* utilization_pct); dragonError_t dragon_memory_pool_get_pointer(const dragonMemoryPoolDescr_t * pool_descr, void **base_ptr); @@ -344,6 +361,15 @@ dragon_memory_descr_clone(dragonMemoryDescr_t * newmem_descr, const dragonMemory dragonError_t dragon_memory_modify_size(dragonMemoryDescr_t * mem_descr, const size_t 
new_size); +dragonError_t +dragon_memory_hash(dragonMemoryDescr_t* mem_descr, dragonULInt* hash_value); + +dragonError_t +dragon_memory_equal(dragonMemoryDescr_t* mem_descr1, dragonMemoryDescr_t* mem_descr2, bool* result); + +dragonError_t +dragon_memory_copy(dragonMemoryDescr_t* from_mem, dragonMemoryDescr_t* to_mem, dragonMemoryPoolDescr_t* to_pool, const timespec_t* timeout); + #ifdef __cplusplus } #endif diff --git a/src/include/dragon/messages.hpp b/src/include/dragon/messages.hpp new file mode 100644 index 0000000..af6f0d6 --- /dev/null +++ b/src/include/dragon/messages.hpp @@ -0,0 +1,490 @@ +#ifndef messages_hpp +#define messages_hpp + +#include +#include +#include +#include +#include +#include + +//namespace DragonInfra { +class DragonError { + public: + DragonError(const dragonError_t err, const char* err_str); + ~DragonError(); + dragonError_t get_rc() const; + const char* get_err_str() const; + + private: + dragonError_t err; + std::string err_str; +}; + +class DragonMsg { + public: + DragonMsg(MessageType tc, uint64_t tag); + virtual ~DragonMsg(); + dragonError_t send(dragonFLISendHandleDescr_t* sendh, const timespec_t* timeout); + MessageType tc(); + uint64_t tag(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + MessageType mTC; + uint64_t mTag; +}; + +class DragonResponseMsg: public DragonMsg { + public: + DragonResponseMsg(MessageType tc, uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + virtual ~DragonResponseMsg(); + uint64_t ref(); + dragonError_t err(); + const char* errInfo(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mRef; + dragonError_t mErr; + std::string mErrInfo; +}; + +class SHCreateProcessLocalChannel: public DragonMsg { + public: + static const MessageType TC = SH_CREATE_PROCESS_LOCAL_CHANNEL; + + SHCreateProcessLocalChannel(uint64_t tag, uint64_t puid, const char* respFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* respFLI(); + const uint64_t puid(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mPUID; + std::string mFLI; +}; + +class SHCreateProcessLocalChannelResponse: public DragonResponseMsg { + public: + static const MessageType TC = SH_CREATE_PROCESS_LOCAL_CHANNEL_RESPONSE; + + SHCreateProcessLocalChannelResponse(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, const char* serChannel); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* serChannel(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mSerChannel; +}; + + +class SHSetKVMsg: public DragonMsg { + public: + static const MessageType TC = SH_SET_KV; + + SHSetKVMsg(uint64_t tag, const char* key, const char* value, const char* respFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* key(); + const char* value(); + const char* respFLI(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mKey; + std::string mValue; + std::string mFLI; +}; + +class SHSetKVResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = SH_SET_KV_RESPONSE; + + SHSetKVResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class SHGetKVMsg: public DragonMsg { + public: + static const MessageType TC = 
SH_GET_KV; + + SHGetKVMsg(uint64_t tag, const char* key, const char* respFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* key(); + const char* respFLI(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mKey; + std::string mFLI; +}; + +class SHGetKVResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = SH_GET_KV_RESPONSE; + + SHGetKVResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, const char* value); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* value(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mValue; +}; + +class DDRegisterClientMsg: public DragonMsg { + public: + static const MessageType TC = DD_REGISTER_CLIENT; + + DDRegisterClientMsg(uint64_t tag, const char* respFLI, const char* bufferedRespFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* respFLI(); + const char* bufferedRespFLI(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mFLI; + std::string bFLI; +}; + +class DDRegisterClientResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_REGISTER_CLIENT_RESPONSE; + + DDRegisterClientResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, uint64_t clientID, uint64_t numManagers); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + uint64_t numManagers(); + + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; + uint64_t mNumManagers; +}; + +class DDDestroyMsg: public DragonMsg { + public: + static const MessageType TC = DD_DESTROY; + + DDDestroyMsg(uint64_t tag, const char* respFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* respFLI(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mFLI; +}; + +class DDDestroyResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_DESTROY_RESPONSE; + + DDDestroyResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDDestroyManagerMsg: public DragonMsg { + public: + static const MessageType TC = DD_DESTROY_MANAGER; + + DDDestroyManagerMsg(uint64_t tag, const char* respFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* respFLI(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mFLI; +}; + +class DDDestroyManagerResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_DESTROY_MANAGER_RESPONSE; + + DDDestroyManagerResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDRegisterManagerMsg: public DragonMsg { + public: + static const MessageType TC = DD_REGISTER_MANAGER; + + DDRegisterManagerMsg(uint64_t tag, const char* mainFLI, const char* respFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* respFLI(); + const char* mainFLI(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + std::string mMainFLI; + 
std::string mRespFLI; +}; + +class DDRegisterManagerResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_REGISTER_MANAGER_RESPONSE; + + DDRegisterManagerResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDRegisterClientIDMsg: public DragonMsg { + public: + static const MessageType TC = DD_REGISTER_CLIENT_ID; + + DDRegisterClientIDMsg(uint64_t tag, uint64_t clientID, const char* respFLI, const char* bufferedRespFLI); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + const char* respFLI(); + const char* bufferedRespFLI(); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; + std::string mRespFLI; + std::string mBufferedRespFLI; +}; + +class DDRegisterClientIDResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_REGISTER_CLIENT_ID_RESPONSE; + + DDRegisterClientIDResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDPutMsg: public DragonMsg { + public: + static const MessageType TC = DD_PUT; + + DDPutMsg(uint64_t tag, uint64_t clientID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; +}; + +class DDPutResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_PUT_RESPONSE; + + DDPutResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDGetMsg: public DragonMsg { + public: + static const MessageType TC = DD_GET; + + DDGetMsg(uint64_t tag, uint64_t clientID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; +}; + +class DDGetResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_GET_RESPONSE; + + DDGetResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDPopMsg: public DragonMsg { + public: + static const MessageType TC = DD_POP; + + DDPopMsg(uint64_t tag, uint64_t clientID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; +}; + +class DDPopResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_POP_RESPONSE; + + DDPopResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDContainsMsg: public DragonMsg { + public: + static const MessageType TC = DD_CONTAINS; + + DDContainsMsg(uint64_t tag, uint64_t clientID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; +}; + +class DDContainsResponseMsg: public DragonResponseMsg { + public: + 
static const MessageType TC = DD_CONTAINS_RESPONSE; + + DDContainsResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDGetLengthMsg: public DragonMsg { + public: + static const MessageType TC = DD_GET_LENGTH; + + DDGetLengthMsg(uint64_t tag, uint64_t clientID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; +}; + +class DDGetLengthResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_GET_LENGTH_RESPONSE; + + DDGetLengthResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, uint64_t length); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t length(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mLength; +}; + +class DDClearMsg: public DragonMsg { + public: + static const MessageType TC = DD_CLEAR; + + DDClearMsg(uint64_t tag, uint64_t clientID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; +}; + +class DDClearResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_CLEAR_RESPONSE; + + DDClearResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +class DDGetIteratorMsg: public DragonMsg { + public: + static const MessageType TC = DD_GET_ITERATOR; + + DDGetIteratorMsg(uint64_t tag, uint64_t clientID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; +}; + +class DDGetIteratorResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_GET_ITERATOR_RESPONSE; + + DDGetIteratorResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, uint64_t iterID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t iterID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mIterID; +}; + +class DDIteratorNextMsg: public DragonMsg { + public: + static const MessageType TC = DD_ITERATOR_NEXT; + + DDIteratorNextMsg(uint64_t tag, uint64_t clientID, uint64_t iterID); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); + uint64_t clientID(); + uint64_t iterID(); + + protected: + virtual void builder(MessageDef::Builder& msg); + + private: + uint64_t mClientID; + uint64_t mIterID; +}; + +class DDIteratorNextResponseMsg: public DragonResponseMsg { + public: + static const MessageType TC = DD_ITERATOR_NEXT_RESPONSE; + + DDIteratorNextResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo); + static dragonError_t deserialize(MessageDef::Reader& reader, DragonMsg** msg); +}; + +dragonError_t +recv_fli_msg(dragonFLIRecvHandleDescr_t* recvh, DragonMsg** msg, const timespec_t* timeout); + +const char* +dragon_msg_tc_name(uint64_t tc); + +//} + +#endif \ No newline at end of file diff --git a/src/include/dragon/return_codes.h b/src/include/dragon/return_codes.h index be47f38..574c3bd 
100644 --- a/src/include/dragon/return_codes.h +++ b/src/include/dragon/return_codes.h @@ -13,6 +13,7 @@ typedef enum dragonError_st { DRAGON_SUCCESS = 0, DRAGON_INVALID_ARGUMENT, DRAGON_INVALID_OPERATION, + DRAGON_INVALID_MESSAGE, DRAGON_NOT_IMPLEMENTED, DRAGON_FAILURE, DRAGON_TIMEOUT, @@ -21,6 +22,8 @@ typedef enum dragonError_st { DRAGON_INVALID_LOCK_KIND, DRAGON_INVALID_SYNC_KIND, DRAGON_EOT, + DRAGON_KEY_NOT_FOUND, + DRAGON_NO_MORE_KEYS, DRAGON_CHANNEL_MEMORY_POOL_NONLOCAL, DRAGON_CHANNEL_ALREADY_DESTROYED, DRAGON_CHANNEL_BUFFER_ERROR, @@ -72,6 +75,7 @@ typedef enum dragonError_st { DRAGON_MEMORY_OPERATION_ATTEMPT_ON_NONLOCAL_POOL, DRAGON_MEMORY_ERRNO, DRAGON_MEMORY_FILENAME_ERROR, + DRAGON_MEMORY_POOL_FULL, DRAGON_LOCK_NOT_INITD, DRAGON_LOCK_ALREADY_INITD, DRAGON_LOCK_PTHREAD_MUTEX_INIT, diff --git a/src/include/dragon/return_codes_map_maker.py b/src/include/dragon/return_codes_map_maker.py index 54df8a1..0867200 100755 --- a/src/include/dragon/return_codes_map_maker.py +++ b/src/include/dragon/return_codes_map_maker.py @@ -4,7 +4,8 @@ def main(): outfile = open('./return_codes_map.h', 'w') outfile.write('/*\n') outfile.write('Copyright 2020, 2022 Hewlett Packard Enterprise Development LP\n\n') - outfile.write('This file was auto-generated by return_codes_map_maker.py. Do Not Edit. */\n\n') + outfile.write('This file was auto-generated by return_codes_map_maker.py. Do Not Edit.\n') + outfile.write('The source file to edit is src/include/dragon/return_codes.h\n*/\n\n') outfile.write('#ifndef DRAGON_RETURN_CODES_MAP\n') outfile.write('#define DRAGON_RETURN_CODES_MAP\n\n') outfile.write('/* This defines a static array to map\n') @@ -22,6 +23,7 @@ def main(): outfile2 = open('../../dragon/return_codes.pxd', 'w') outfile2.write('# This file was auto-generated by return_codes_map_maker.py. 
Do Not Edit.\n') + outfile2.write('# The source file to edit is src/include/dragon/return_codes.h\n\n') outfile2.write('cdef extern from "":\n') outfile2.write('\n') outfile2.write(' ctypedef enum dragonError_t:\n') diff --git a/src/include/dragon/utils.h b/src/include/dragon/utils.h index 31197e1..038a497 100644 --- a/src/include/dragon/utils.h +++ b/src/include/dragon/utils.h @@ -12,6 +12,9 @@ extern "C" { #endif +dragonULInt +dragon_get_local_rt_uid(); + dragonError_t dragon_set_procname(char * name); @@ -55,14 +58,29 @@ dragonError_t dragon_timespec_remaining(const timespec_t * deadline, timespec_t * remaining_timeout); char* -dragon_base64_encode(uint8_t *data, size_t input_length, size_t *output_length); +dragon_base64_encode(uint8_t *data, size_t input_length); uint8_t* -dragon_base64_decode(const char *data, size_t input_length, size_t *output_length); +dragon_base64_decode(const char *data, size_t *output_length); dragonULInt dragon_hash_ulint(dragonULInt x); +dragonULInt +dragon_hash(void* ptr, size_t num_bytes); + +bool +dragon_bytes_equal(void* ptr1, void* ptr2, size_t ptr1_numbytes, size_t ptr2_numbytes); + +dragonError_t +dragon_ls_set_kv(const unsigned char* key, const unsigned char* value, const timespec_t* timeout); + +dragonError_t +dragon_ls_get_kv(const unsigned char* key, char** value, const timespec_t* timeout); + +uint64_t +dragon_sec_to_nsec(uint64_t sec); + #ifdef __cplusplus } #endif diff --git a/src/lib/Makefile b/src/lib/Makefile index 7209388..681f4c0 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -1,24 +1,41 @@ +ifneq ("$(wildcard $(DRAGON_BASE_DIR)/.dragon-config.mk)","") + include $(DRAGON_BASE_DIR)/.dragon-config.mk +endif + CC = gcc #CFLAGS = -O0 -g -fPIC -Wall CFLAGS = -O3 -fomit-frame-pointer -fPIC -Wall CXX = g++ -CXXFLAGS = $(CFLAGS) -CPPFLAGS = -I ../include -DDRAGON_DEBUG +CPPFLAGS = -I ../include -DDRAGON_DEBUG $(CONFIG_INCLUDE) $(CONFIG_DEFINES) +CXXFLAGS = $(CFLAGS) $(CPPFLAGS) -std=c++14 LIBRARIES := libdragon.so libpmsgqueue.so libpmod.so libdragon_HEADERS := $(wildcard *.h) \ + $(wildcard gpu/*.hpp) \ $(filter-out ../include/dragon/return_codes_map.h,$(wildcard ../include/dragon/*.h)) \ - ../include/dragon/return_codes_map.h -libdragon_SOURCES := $(wildcard *.c) $(wildcard *.cpp) -libdragon_OBJECTS := $(addsuffix .o,$(basename $(libdragon_SOURCES))) + ../include/dragon/return_codes_map.h \ + ../include/dragon/message_tcs.h + +libdragon_CAPNP_SOURCES = $(wildcard *.capnp) +libdragon_SOURCES := $(wildcard *.c) $(wildcard *.cpp) gpu/gpu.cpp $(CONFIG_SOURCES) +libdragon_OBJECTS := $(addsuffix .o,$(libdragon_CAPNP_SOURCES)) $(addsuffix .o,$(basename $(libdragon_SOURCES))) + .PHONY: all -all: $(LIBRARIES) +all: message_defs.capnp.o ../include/dragon/message_tcs.h ../include/dragon/return_codes_map.h $(LIBRARIES) $(libdragon_OBJECTS): $(libdragon_HEADERS) +message_defs.capnp.o: + capnp compile -oc++ message_defs.capnp + cp message_defs.capnp.h ../include/dragon + $(CXX) -c $(CXXFLAGS) message_defs.capnp.c++ + +../include/dragon/message_tcs.h: ../dragon/infrastructure/messages.py + python3 message_tcs_to_enum.py + .PHONY: ../include/dragon/return_codes_map.h ../include/dragon/return_codes_map.h: ../include/%: $(MAKE) -C ../include $* @@ -27,7 +44,7 @@ libdragon.so: $(libdragon_OBJECTS) libdragon.so: LDLIBS += -lstdc++ -lrt libdragon.so: %.so: - $(LINK.c) -shared -Wl,-soname,$(notdir $@) -o $@ $^ $(LDLIBS) + $(LINK.c) -L . 
-shared -Wl,-soname,$(notdir $@) -o $@ $^ -Wl,--whole-archive libcapnp.a libkj.a -Wl,--no-whole-archive $(LDLIBS) libpmod.so: libdragon.so ln -sf libdragon.so libpmod.so @@ -43,5 +60,5 @@ $(filter-out libpmod.so,$(filter-out libdragon.so,$(LIBRARIES))): lib%.so: ../dr .PHONY: clean clean: - $(RM) $(LIBRARIES) *.o libpmod.so + $(RM) $(LIBRARIES) *.o libpmod.so *.capnp.c++ *.capnp.h *.capnp.o $(MAKE) -C ../dragon/launcher/src clean diff --git a/src/lib/_bcast.h b/src/lib/_bcast.h index 680aff9..b6007c3 100644 --- a/src/lib/_bcast.h +++ b/src/lib/_bcast.h @@ -19,7 +19,7 @@ #define DRAGON_BCAST_UMAP_SEED 487 #define DRAGON_BCAST_MAX_SERIALIZED_LEN (DRAGON_MEMORY_MAX_SERIALIZED_LEN+(DRAGON_BCAST_SERIAL_NULINTS*sizeof(dragonULInt))) #define DRAGON_BCAST_SPIN_CHECK_TIMEOUT_ITERS 10000UL -#define DRAGON_BCAST_ADAPTIVE_WAIT_TO_IDLE 50 +#define DRAGON_BCAST_ADAPTIVE_WAIT_TO_IDLE 10 #define DRAGON_BCAST_DESTROY_TIMEOUT_SEC 10 /* attributes and header info embedded into a BCast object NOTE: This must match diff --git a/src/lib/_bitset.h b/src/lib/_bitset.h index 5eaef6a..42433f9 100644 --- a/src/lib/_bitset.h +++ b/src/lib/_bitset.h @@ -65,6 +65,9 @@ dragon_bitset_init(void* ptr, dragonBitSet_t* set, const size_t num_bits); dragonError_t dragon_bitset_destroy(dragonBitSet_t* set); +dragonError_t +dragon_bitset_clear(dragonBitSet_t* set); + dragonError_t dragon_bitset_attach(void* ptr, dragonBitSet_t* set); diff --git a/src/lib/_channelsets.h b/src/lib/_channelsets.h index b91d5ee..12ffa70 100644 --- a/src/lib/_channelsets.h +++ b/src/lib/_channelsets.h @@ -28,6 +28,7 @@ typedef struct dragonChannelSet_st { int num_channels; bool first_poll_call; uint8_t event_mask; + pthread_t tid; } dragonChannelSet_t; typedef struct dragonChannelSetCallbackArg_st { @@ -47,4 +48,4 @@ static_assert(sizeof(dragonChannelEventNotification_t) == sizeof(dragonChannelSe } #endif -#endif \ No newline at end of file +#endif diff --git a/src/lib/_ddict.h b/src/lib/_ddict.h new file mode 100644 index 0000000..092bb5d --- /dev/null +++ b/src/lib/_ddict.h @@ -0,0 +1,63 @@ +#ifndef HAVE_DRAGON_DDICT_INTERNAL_H +#define HAVE_DRAGON_DDICT_INTERNAL_H + +#include "umap.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define DRAGON_DDICT_UMAP_SEED 1776 + + +// Struct to handle buffering keys before sending +typedef struct dragonDDictBufAlloc_st { + uint8_t * data; + size_t num_bytes; + struct dragonDDictBufAlloc_st * next; +} dragonDDictBufAlloc_t; + +/** + * @brief An enum DDict request types + * +*/ +typedef enum dragonDDictReqType_st { + DRAGON_DDICT_NO_OP, + DRAGON_DDICT_GET_REQ, + DRAGON_DDICT_PUT_REQ, + DRAGON_DDICT_CONTAINS_REQ, + DRAGON_DDICT_FINALIZED +} dragonDDictReqType_t; + +typedef struct dragonDDict_st { + dragonDDictSerial_t ser; // stored for easy access by serialize call. + dragonFLIDescr_t orchestrator_fli; // FLI handle for orchestrator messages + dragonFLIDescr_t * manager_flis; // FLI handles for managers + dragonFLIDescr_t respFLI; // This handles non-buffered, streaming responses to requests + dragonFLIDescr_t bufferedRespFLI; // This handles buffered responses to requests + char* respFLIStr; // Needed for messaging between client and managers/orch. + char* bufferedRespFLIStr; // Needed for message between client and managers/orch. 
+ uint64_t clientID; + size_t num_managers; + dragonULInt dd_uid; // UID for umap storage +} dragonDDict_t; + +typedef struct dragonDDictReq_st { + dragonDDict_t* ddict; + dragonULInt dd_uid; + size_t buffer_size; + dragonDDictBufAlloc_t * buffered_allocs; // Linked list buffer for key + uint8_t * key_data; // Hold onto key data (may be useful later, free on finalize) + dragonULInt key_hash; // Hold onto key hash + dragonDDictReqType_t op_type; // What operation type for error checking + dragonFLISendHandleDescr_t sendh; // DDict manager send handle + dragonFLIRecvHandleDescr_t recvh; // DDict manager receive handle +} dragonDDictReq_t; + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/lib/_fli.h b/src/lib/_fli.h index 8cdbd6d..5ffcf40 100644 --- a/src/lib/_fli.h +++ b/src/lib/_fli.h @@ -26,6 +26,7 @@ typedef struct dragonFLI_st { bool has_main_ch; /* true if main_ch is initialized and used. */ bool has_mgr_ch; /* true if mgr_ch is initialized and used. */ bool use_buffered_protocol; /* true if not using stream channels */ + bool was_attached; /* true if attach is used */ } dragonFLI_t; /* buffered allocation used for the buffered protocol on these diff --git a/src/lib/_hashtable.h b/src/lib/_hashtable.h index 323c78a..3fbc8d0 100644 --- a/src/lib/_hashtable.h +++ b/src/lib/_hashtable.h @@ -35,7 +35,8 @@ extern "C" { typedef struct dragonHashtableHeader_st { uint64_t num_slots; - uint64_t* count_ptr; + uint64_t* num_kvs; + uint64_t* num_placeholders; uint64_t key_len; uint64_t value_len; uint64_t* armor1; diff --git a/src/lib/_managed_memory.h b/src/lib/_managed_memory.h index 3596e9d..9a54e26 100644 --- a/src/lib/_managed_memory.h +++ b/src/lib/_managed_memory.h @@ -33,7 +33,7 @@ extern "C" { #define DRAGON_MEMORY_POOL_UMAP_SEED 7 #define DRAGON_MEMORY_MEM_UMAP_SEED 9 -#define DRAGON_MEMORY_POOLSER_NULINTS 4 +#define DRAGON_MEMORY_POOLSER_NULINTS 5 #define DRAGON_MEMORY_POOL_MAX_SERIALIZED_LEN (DRAGON_MEMORY_MAX_FILE_NAME_LENGTH + (sizeof(dragonULInt) * DRAGON_MEMORY_POOLSER_NULINTS)) #define DRAGON_MEMORY_MAX_SERIALIZED_LEN (DRAGON_MEMORY_POOL_MAX_SERIALIZED_LEN + (DRAGON_MEMORY_MEMSER_NULINTS * (sizeof(dragonULInt)))) #define DRAGON_MEMORY_MEMSER_NULINTS 4 @@ -71,6 +71,7 @@ typedef struct dragonMemoryPoolHeader_st { be able to serialize a remote memory pool */ typedef struct dragonRemoteMemoryPoolInfo_st { dragonULInt hostid; + dragonULInt rt_uid; dragonULInt m_uid; dragonULInt mem_type; dragonULInt manifest_len; @@ -88,6 +89,7 @@ typedef struct dragonMemoryPool_st { int mfd; // Manifest file descriptor size_t data_requested_size; size_t manifest_requested_size; // the max number of manifest records + bool runtime_is_local; void * local_dptr; // Data blob pointer, if == NULL then this pool is non-local. void * mptr; // Manifest blob pointer dragonMemoryPoolHeap_t heap; diff --git a/src/lib/bcast.c b/src/lib/bcast.c index 3d0870c..fb29909 100644 --- a/src/lib/bcast.c +++ b/src/lib/bcast.c @@ -356,6 +356,9 @@ _spin_wait(dragonBCast_t * handle, void * num_waiting_ptr, timespec_t* end_time, atomic_uint expected = 0UL; timespec_t now_time; + /* Assume we will find a spot. This makes sure we don't miss a spinner later. 
*/ + atomic_fetch_add(handle->header.spin_list_count, 1L); + while ((found == false) && (idx < *(handle->header.spin_list_sz))) { if (atomic_compare_exchange_strong(&(handle->header.spin_list[idx]), &expected, 1UL)) found = true; @@ -372,20 +375,20 @@ _spin_wait(dragonBCast_t * handle, void * num_waiting_ptr, timespec_t* end_time, switch over to idle wait when a process can't get a spot in the spin list. This will lead to the process waiting until all other spin waiters are done, but there is no guaranteed ordering among spin waiters either. */ + atomic_fetch_add(handle->header.spin_list_count, -1L); return _idle_wait(handle, num_waiting_ptr, end_time, release_fun, release_arg, NULL); } /* increment the num_waiting atomically, but also get the new value for num_waiting */ - dragonULInt my_num_waiting = atomic_fetch_add(handle->header.num_waiting, 1L) + 1; + dragonULInt my_num_waiting = atomic_fetch_add(handle->header.num_waiting, 1UL) + 1; if (*(handle->header.sync_type) == DRAGON_SYNC && my_num_waiting > *(handle->header.sync_num)) { atomic_fetch_add(handle->header.num_waiting, -1L); atomic_store(&handle->header.spin_list[idx], 0UL); + atomic_fetch_add(handle->header.spin_list_count, -1L); err_return(DRAGON_INVALID_OPERATION, "There cannot be more waiters than the specified sync number on a synchronized bcast"); } - atomic_fetch_add(handle->header.spin_list_count, 1L); - /* When the process is now officially waiting, the resource release (back in the caller) can occur. The resource release is optional. */ if (release_fun != NULL) @@ -437,9 +440,12 @@ _spin_wait(dragonBCast_t * handle, void * num_waiting_ptr, timespec_t* end_time, if (wait_mode != NULL && *wait_mode == DRAGON_ADAPTIVE_WAIT && number_of_yields == DRAGON_BCAST_ADAPTIVE_WAIT_TO_IDLE) { /* We did a spin wait for a bit, now it's time for idle wait. */ - atomic_store(&handle->header.spin_list[idx], 0UL); - atomic_fetch_add(handle->header.spin_list_count, -1L); - return _idle_wait(handle, num_waiting_ptr, end_time, release_fun, release_arg, &my_num_waiting); + expected = 1UL; + if (atomic_compare_exchange_strong(&(handle->header.spin_list[idx]), &expected, 0UL)) { + atomic_fetch_add(handle->header.spin_list_count, -1L); + return _idle_wait(handle, num_waiting_ptr, end_time, release_fun, release_arg, &my_num_waiting); + } else + break; } sched_yield(); @@ -1629,7 +1635,7 @@ dragon_bcast_notify_callback(dragonBCastDescr_t* bd, void* user_def_ptr, const d * * @param bd The BCast's descriptor handle. * - * @param timer A timeout value. If the triggerd process does not complete its triggering within the timeout + * @param timer A timeout value. If the triggered process does not complete its triggering within the timeout * period, this process will get a DRAGON_TIMEOUT return code. A value of NULL means to wait * forever for triggering to complete. 
* @@ -1768,36 +1774,29 @@ dragon_bcast_trigger_some(dragonBCastDescr_t* bd, int num_to_trigger, const time will change in the object as spinners wake up */ size_t current_spinner_count = (atomic_uint)*handle->header.spin_list_count; int idx = 0; + atomic_uint expected = 0UL; while ((num_spinners < current_spinner_count) && (idx < *(handle->header.spin_list_sz)) && (num_spinners < num_to_trigger)) { - if (atomic_load(&handle->header.spin_list[idx]) == 1UL) { + expected = 1UL; + if (atomic_compare_exchange_strong(&(handle->header.spin_list[idx]), &expected, 2UL)) num_spinners+=1; - handle->header.spin_list[idx] = 2UL; - } idx += 1; } - if (num_spinners < num_to_trigger) { - uint32_t num_to_wake = num_to_trigger - num_spinners; - *handle->header.allowable_count = num_to_wake; - - // long num_woke = - syscall(SYS_futex, triggering_ptr, FUTEX_WAKE, *handle->header.allowable_count, NULL, NULL, 0); - // It would be nice to make this check below but we can't rely on the value of - // num_woke, because there may be idle waiting processes that don't actually - // make it to the futex wait before the futex wake was executed above. That's - // perfectly OK, but it means that num_woke may be less than num_to_wake. - // if (num_to_wake != num_woke) { - // char err_str[200]; - // snprintf(err_str, 199, "BCast object failed to wake %ld idle waiters. It woke %ld instead.", num_to_wake, num_woke); - // err_return(DRAGON_FAILURE, err_str); - // } - } - /* the while loop below waits for all the triggered to pick up their payload. */ size_t check_timeout_when_0 = 1; + uint32_t num_to_wake = num_to_trigger - num_spinners; + + /* As each process wakes up it will decrement allowable count. */ + *handle->header.allowable_count = num_to_wake; + + while ((atomic_load(num_triggered_ptr) < num_to_trigger) && (atomic_load(handle->header.num_waiting) > 0UL)) { + + if (num_to_wake > 0) { + long num_woke = syscall(SYS_futex, triggering_ptr, FUTEX_WAKE, num_to_wake, NULL, NULL, 0); + num_to_wake = num_to_wake - num_woke; + } - while ((atomic_load(num_triggered_ptr) < num_to_trigger) && (atomic_load(handle->header.num_waiting) > 0)) { if (timer != NULL) { if (check_timeout_when_0 == 0) { clock_gettime(CLOCK_MONOTONIC, &now_time); diff --git a/src/lib/bitset.c b/src/lib/bitset.c index 6405a70..d916ea8 100644 --- a/src/lib/bitset.c +++ b/src/lib/bitset.c @@ -110,7 +110,34 @@ dragon_bitset_init(void* ptr, dragonBitSet_t* set, const size_t num_bits) *size_ptr = num_bits; set->size = num_bits; set->data = (char*) (ptr + sizeof(size_t)); - size_t max_idx = (num_bits + 7) / 8; + + dragon_bitset_clear(set); // Called internally this will not fail. + + no_err_return(DRAGON_SUCCESS); +} + +/** @brief Clear a bitset to all zeroes. + * + * This API provides a bitset implementation that resides in a pre-allocated blob of memory. + * This datatype does not do any dynamic allocation of memory on its own. The bitset is a set + * of integers ranging from 0 to num_bits-1. + * + * A bitset must be cleared (i.e. set to zeroes) by calling this. The BitSet should have been + * previously initialized. + * + * @param set A pointer to a handle to an initialized bitset. + * + * @return + * * **DRAGON_SUCCESS** It did its job. + * * **DRAGON_BITSET_NULL_POINTER** The set was a null-pointer.
+ */ +dragonError_t +dragon_bitset_clear(dragonBitSet_t* set) +{ + if (set == NULL) + err_return(DRAGON_BITSET_NULL_POINTER,"The dragonBitSet handle pointer is NULL."); + + size_t max_idx = (set->size + 7) / 8; + for (size_t k = 0; k < max_idx; k++) set->data[k] = 0; diff --git a/src/lib/channels.c b/src/lib/channels.c index d006673..5ee0faa 100644 --- a/src/lib/channels.c +++ b/src/lib/channels.c @@ -9,7 +9,6 @@ #include #include -static int dg_num_gateways = 0; static dragonMap_t* dg_channels = NULL; static dragonList_t* dg_gateways = NULL; @@ -88,30 +87,31 @@ _channel_from_descr(const dragonChannelDescr_t* ch_descr, dragonChannel_t** ch) err_return(DRAGON_INVALID_ARGUMENT, "invalid channel descriptor"); /* find the entry in our pool map for this descriptor */ - dragonError_t err = dragon_umap_getitem(dg_channels, ch_descr->_idx, (void*)ch); + dragonError_t err = dragon_umap_getitem_multikey(dg_channels, ch_descr->_rt_idx, ch_descr->_idx, (void*)ch); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to find item in channels umap"); no_err_return(DRAGON_SUCCESS); } -/* given a c_uid, check if we already are attached to that channel and update - * the descriptor for use */ +/* given an rt_uid and c_uid, check if we already are attached to that channel and + * update the descriptor for use */ static dragonError_t -_channel_descr_from_c_uid(const dragonC_UID_t c_uid, dragonChannelDescr_t* ch_descr) +_channel_descr_from_uids(const dragonRT_UID_t rt_uid, const dragonC_UID_t c_uid, dragonChannelDescr_t* ch_descr) { if (ch_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "invalid channel descriptor"); /* find the entry in our pool map for this descriptor */ dragonChannel_t* channel; - dragonError_t err = dragon_umap_getitem(dg_channels, c_uid, (void*)&channel); + dragonError_t err = dragon_umap_getitem_multikey(dg_channels, rt_uid, c_uid, (void*)&channel); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to find item in channels umap"); /* update the descriptor with the m_uid key and note this cannot be original */ // ch_descr->_original = 0; /* @MCB: Not used yet */ + ch_descr->_rt_idx = rt_uid; ch_descr->_idx = c_uid; no_err_return(DRAGON_SUCCESS); @@ -137,7 +137,7 @@ _add_umap_channel_entry(const dragonChannelDescr_t* ch, const dragonChannel_t* n append_err_return(err, "failed to create umap for channels"); } - err = dragon_umap_additem(dg_channels, ch->_idx, newch); + err = dragon_umap_additem_multikey(dg_channels, ch->_rt_idx, ch->_idx, newch); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to insert item into channels umap"); @@ -158,39 +158,37 @@ _add_umap_channel_entry(const dragonChannelDescr_t* ch, const dragonChannel_t* n * @returns DRAGON_SUCCESS or an error indicating the problem.
*/ static dragonError_t -_register_gateway(const dragonChannel_t* ch) +_register_gateway(const dragonChannel_t* ch, dragonList_t** gateways) { dragonError_t err; /* register this channel in our umap */ - if (dg_gateways == NULL) { + if (*gateways == NULL) { /* this is a process-global variable and has no specific call to be * destroyed */ - dg_gateways = malloc(sizeof(dragonList_t)); - if (dg_gateways == NULL) + *gateways = malloc(sizeof(dragonList_t)); + if (*gateways == NULL) err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Cannot allocate ulist for gateway channels."); - err = dragon_ulist_create(dg_gateways); + err = dragon_ulist_create(*gateways); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to create ulist for gateway channels"); } - err = dragon_ulist_additem(dg_gateways, ch); + err = dragon_ulist_additem(*gateways, ch); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to insert item into gateway channels list"); - ++dg_num_gateways; - no_err_return(DRAGON_SUCCESS); } static dragonError_t -_unregister_gateway(const dragonChannel_t* ch) +_unregister_gateway(const dragonChannel_t* ch, dragonList_t *gateways) { - if (dg_gateways == NULL) + if (gateways == NULL) err_return(DRAGON_CHANNEL_NO_GATEWAYS, "no gateways have been registered"); - dragonError_t err = dragon_ulist_delitem(dg_gateways, ch); + dragonError_t err = dragon_ulist_delitem(gateways, ch); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to delete item from gateway channels list"); @@ -312,7 +310,7 @@ _assign_header(const dragonC_UID_t c_uid, const dragonChannelAttr_t* attr, drago dragonError_t err; // clang-format off - *(ch->header.c_uid) = (dragonULInt)c_uid; + *(ch->header.c_uid) = (dragonC_UID_t)c_uid; *(ch->header.bytes_per_msg_block) = (dragonULInt)attr->bytes_per_msg_block; *(ch->header.capacity) = (dragonULInt)attr->capacity; *(ch->header.lock_type) = (dragonULInt)attr->lock_type; @@ -1343,7 +1341,7 @@ _send_msg(dragonChannel_t* channel, const dragonUUID sendhid, const dragonMessag channel_full = true; /* This is used in implementing the blocking receive and must be done here - to trigger a blocked sender if one exists */ + to trigger a blocked receiver if one exists */ dragon_bcast_trigger_one(&channel->recv_bcast, NULL, NULL, 0); /* This must be done here to prevent the event bcast list from changing @@ -1499,6 +1497,7 @@ _get_msg(dragonChannel_t* channel, dragonMessage_t* msg_recv, timespec_t* end_ti dragonMemorySerial_t mem_ser; mem_ser.data = (uint8_t*)channel->msg_blks_ptrs[mblk]; mem_ser.len = src_bytes; + err = dragon_memory_attach(&mem_descr, &mem_ser); if (err != DRAGON_SUCCESS) { _release_ot_lock(channel); @@ -1585,6 +1584,7 @@ _get_msg(dragonChannel_t* channel, dragonMessage_t* msg_recv, timespec_t* end_ti dragonMemorySerial_t mem_ser; mem_ser.len = src_bytes; mem_ser.data = (uint8_t*)channel->msg_blks_ptrs[mblk]; + err = dragon_memory_attach(msg_mem, &mem_ser); if (err != DRAGON_SUCCESS) append_err_return(err, "cannot attach to payload memory"); @@ -1720,6 +1720,7 @@ _peek_msg(dragonChannel_t* channel, dragonMessage_t* msg_peek) dragonMemorySerial_t mem_ser; mem_ser.len = src_bytes; mem_ser.data = (uint8_t*)channel->msg_blks_ptrs[mblk]; + err = dragon_memory_attach(&msg_mem_descr, &mem_ser); if (err != DRAGON_SUCCESS) append_err_return(err, "cannot attach to payload memory"); @@ -1957,6 +1958,33 @@ _channel_is_masquerading(const dragonChannelDescr_t* ch) return false; } +static dragonError_t +_attach_to_gateway(char *ip_addrs_key, dragonChannelDescr_t *gw_ch) +{ + dragonError_t 
err; + char err_str[400]; + + char *gw_str = getenv(ip_addrs_key); + + if (gw_str == NULL) { + snprintf(err_str, 399, "NULL gateway descriptor for key=%s", ip_addrs_key); + err_return(DRAGON_INVALID_ARGUMENT, err_str); + } + + dragonChannelSerial_t gw_ser; + gw_ser.data = dragon_base64_decode(gw_str, &gw_ser.len); + if (gw_ser.data == NULL) { + err_return(DRAGON_INVALID_ARGUMENT, "failed to decode string specifying gateway descriptor"); + } + + err = dragon_channel_attach(&gw_ser, gw_ch); + if (err != DRAGON_SUCCESS) { + append_err_return(DRAGON_INVALID_ARGUMENT, err_str); + } + + no_err_return(DRAGON_SUCCESS); +} + static dragonError_t _get_gw_idx(const dragonChannelDescr_t *ch, dragonChannelOpType_t op_type, int *gw_idx) { @@ -1979,25 +2007,91 @@ _get_gw_idx(const dragonChannelDescr_t *ch, dragonChannelOpType_t op_type, int * * \___________________/ \___________________/ \___________________/ \___________________/ * nic 0 nic 1 nic 2 nic 3 */ - if (dg_num_gateways == 1) { + size_t num_gws = dragon_ulist_get_size(dg_gateways); + + if (num_gws == 1) { *gw_idx = 0; } else { - int num_gw_groups = dg_num_gateways / dg_num_gateway_types; + int num_gw_groups = num_gws / dg_num_gateway_types; int my_gw_group = dragon_hash_ulint(target_hostid) % num_gw_groups; *gw_idx = (dg_num_gateway_types * my_gw_group) + op_type; } - if (*gw_idx < 0 || dg_num_gateways <= *gw_idx) { + if (*gw_idx < 0 || num_gws <= *gw_idx) { char err_str[100]; snprintf(err_str, 99, - "Invalid gateway index: gateway idx=%d, num gateways=%d.", - *gw_idx, dg_num_gateways); + "Invalid gateway index: gateway idx=%d, num gateways=%lu.", + *gw_idx, num_gws); append_err_return(err, err_str); } no_err_return(DRAGON_SUCCESS); } +static dragonError_t +_get_gateway(const dragonChannelDescr_t *ch_descr, dragonChannelOpType_t op_type, dragonChannel_t** gw_channel) +{ + dragonError_t err; + dragonChannel_t *channel = NULL; + bool runtime_is_local; + + err = _channel_from_descr(ch_descr, &channel); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get channel from descriptor."); + + err = dragon_memory_pool_runtime_is_local(&channel->pool, &runtime_is_local); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not determine if channel is hosted by local runtime."); + + if (runtime_is_local) { + int gw_idx; + + if (dg_gateways == NULL) { + char err_str[400]; + dragonULInt rt_uid; + + err = dragon_memory_pool_get_rt_uid(&channel->pool, &rt_uid); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get pool's rt_uid."); + + snprintf(err_str, 399, + "There are no registered gateway channels and the channel is not local, " + "local and remote runtime ip addrs: %lu and %lu", + dragon_get_local_rt_uid(), + rt_uid); + err_return(DRAGON_CHANNEL_NO_GATEWAYS, err_str); + } + + err = _get_gw_idx(ch_descr, op_type, &gw_idx); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get a gateway index."); + + err = dragon_ulist_get_by_idx(dg_gateways, gw_idx, (void **) gw_channel); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get gateway channel."); + } else { + dragonULInt rt_uid; + dragonChannelDescr_t gw_ch; + char ip_addrs_key[64]; + + err = dragon_memory_pool_get_rt_uid(&channel->pool, &rt_uid); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get pool's rt_uid."); + + sprintf(ip_addrs_key, "DRAGON_RT_UID__%lu", rt_uid); + + err = _attach_to_gateway(ip_addrs_key, &gw_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not attach to gateway channel."); + + err 
= _channel_from_descr(&gw_ch, gw_channel); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get gateway channel from descriptor."); + } + + no_err_return(DRAGON_SUCCESS); +} + // BEGIN USER API /** @defgroup channels_lifecycle Channels Lifecycle Functions @@ -2228,8 +2322,7 @@ dragon_channel_create(dragonChannelDescr_t* ch, const dragonC_UID_t c_uid, size_t alloc_size = _channel_allocation_size(attr); /* allocate the space using the alloc type interface with a channel type */ - err = dragon_memory_alloc_type(&newch->main_mem, &newch->pool, alloc_size, DRAGON_MEMORY_ALLOC_CHANNEL, - c_uid); + err = dragon_memory_alloc_type(&newch->main_mem, &newch->pool, alloc_size, DRAGON_MEMORY_ALLOC_CHANNEL, c_uid); if (err != DRAGON_SUCCESS) { append_err_noreturn("unable to allocate memory for channel from memory pool"); goto ch_fail; @@ -2282,6 +2375,7 @@ dragon_channel_create(dragonChannelDescr_t* ch, const dragonC_UID_t c_uid, /* register this channel in our umap using the c_uid as the key */ ch->_idx = c_uid; + ch->_rt_idx = dragon_get_local_rt_uid(); err = _add_umap_channel_entry(ch, newch); if (err != DRAGON_SUCCESS) { @@ -2343,7 +2437,7 @@ dragon_channel_destroy(dragonChannelDescr_t* ch) int allocation_exists; err = dragon_memory_pool_allocation_exists(&channel->pool, DRAGON_MEMORY_ALLOC_CHANNEL, - *channel->header.c_uid, &allocation_exists); + (dragonULInt) *channel->header.c_uid, &allocation_exists); if (allocation_exists == 0) { err_return(DRAGON_CHANNEL_ALREADY_DESTROYED, "This channel allocation does not exist and was likely already destroyed."); @@ -2373,7 +2467,7 @@ dragon_channel_destroy(dragonChannelDescr_t* ch) append_err_return(err, "cannot free the serialized descriptor"); /* remove the item from the umap */ - err = dragon_umap_delitem(dg_channels, ch->_idx); + err = dragon_umap_delitem_multikey(dg_channels, ch->_rt_idx, ch->_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to delete item from channels umap"); @@ -2480,39 +2574,29 @@ dragon_channel_attach(const dragonChannelSerial_t* ch_ser, dragonChannelDescr_t* dragonC_UID_t c_uid = (dragonC_UID_t)*sptr; sptr++; - /* check if we have already attached to the c_uid */ - dragonError_t err = _channel_descr_from_c_uid(c_uid, ch); - if (err == DRAGON_SUCCESS) { - dragonChannel_t* channel; - _channel_from_descr(ch, &channel); - /* The proc_flags are creation attributes. If the same process created - the channel, and then attached to it, the attached instance should not - inherit the flags specified when the channel was created. */ - channel->proc_flags = DRAGON_CHANNEL_FLAGS_NONE; - atomic_fetch_add_explicit(&(channel->ref_cnt), 1, memory_order_acq_rel); - no_err_return(DRAGON_SUCCESS); - } - /* we'll need to construct a new channel structure (freed in either * channel_destroy or channel_detach) */ dragonChannel_t* channel = malloc(sizeof(dragonChannel_t)); if (channel == NULL) err_return(DRAGON_INTERNAL_MALLOC_FAIL, "unable to allocate new channel structure"); + /* These are create attributes only. So attaching in a different process + should not inherit the flags specified when the channel was created. 
*/ + channel->proc_flags = DRAGON_CHANNEL_FLAGS_NONE; + /* attach to the memory descriptor */ + dragonError_t err; dragonMemorySerial_t mem_ser; + mem_ser.len = ch_ser->len - DRAGON_CHANNEL_CHSER_NULINTS * sizeof(dragonULInt); mem_ser.data = (uint8_t*)sptr; + err = dragon_memory_attach(&channel->main_mem, &mem_ser); if (err != DRAGON_SUCCESS) { append_err_noreturn("cannot attach to memory with serialized descriptor"); goto ch_attach_fail; } - /* These are create attributes only. So attaching in a different process - should not inherit the flags specified when the channel was created. */ - channel->proc_flags = DRAGON_CHANNEL_FLAGS_NONE; - /* get the pool descriptor and pointer from the memory descriptor */ err = dragon_memory_get_pool(&channel->main_mem, &channel->pool); if (err != DRAGON_SUCCESS) { @@ -2520,6 +2604,27 @@ dragon_channel_attach(const dragonChannelSerial_t* ch_ser, dragonChannelDescr_t* goto ch_attach_mem_fail; } + dragonRT_UID_t rt_uid; + + err = dragon_memory_pool_get_rt_uid(&channel->pool, &rt_uid); + if (err != DRAGON_SUCCESS) { + append_err_noreturn("cannot get rt_uid from memory pool"); + goto ch_attach_mem_fail; + } + + /* check if we have already attached to the rt_uid/c_uid pair */ + err = _channel_descr_from_uids(rt_uid, c_uid, ch); + if (err == DRAGON_SUCCESS) { + dragonChannel_t* channel; + _channel_from_descr(ch, &channel); + /* The proc_flags are creation attributes. If the same process created + the channel, and then attached to it, the attached instance should not + inherit the flags specified when the channel was created. */ + channel->proc_flags = DRAGON_CHANNEL_FLAGS_NONE; + atomic_fetch_add_explicit(&(channel->ref_cnt), 1, memory_order_acq_rel); + no_err_return(DRAGON_SUCCESS); + } + if (dragon_memory_pool_is_local(&channel->pool)) { err = dragon_memory_get_pointer(&channel->main_mem, &channel->local_main_ptr); @@ -2568,7 +2673,9 @@ dragon_channel_attach(const dragonChannelSerial_t* ch_ser, dragonChannelDescr_t* atomic_store(&(channel->ref_cnt), 1); /* register this channel in our umap using the c_uid as the key */ + ch->_rt_idx = rt_uid; ch->_idx = c_uid; + err = _add_umap_channel_entry(ch, channel); if (err != DRAGON_SUCCESS) { append_err_noreturn("failed to insert item into channels umap"); @@ -2632,6 +2739,7 @@ dragon_channel_detach(dragonChannelDescr_t* ch) append_err_return(err, "invalid channel descriptor"); long int ref_cnt = atomic_fetch_sub_explicit(&(channel->ref_cnt), 1, memory_order_acq_rel) - 1; + if (ref_cnt > 0) no_err_return(DRAGON_SUCCESS); @@ -2666,7 +2774,7 @@ dragon_channel_detach(dragonChannelDescr_t* ch) } // remove channel from umap - err = dragon_umap_delitem(dg_channels, ch->_idx); + err = dragon_umap_delitem_multikey(dg_channels, ch->_rt_idx, ch->_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not remove channel from umap"); @@ -2918,24 +3026,16 @@ dragon_channel_sendh(const dragonChannelDescr_t* ch, dragonChannelSendh_t* ch_sh if (dragon_channel_is_local(ch)) { ch_sh->_gw._idx = 0; + ch_sh->_gw._rt_idx = 0; } else { dragonChannel_t* gw_channel; - if (dg_gateways == NULL) - err_return(DRAGON_CHANNEL_NO_GATEWAYS, "There are no registered gateway channels and the channel " - "is not local."); - - int gw_idx; - - err = _get_gw_idx(&ch_sh->_ch, DRAGON_OP_TYPE_SEND_MSG, &gw_idx); + err = _get_gateway(&ch_sh->_ch, DRAGON_OP_TYPE_SEND_MSG, &gw_channel); if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not get a gateway index."); - - err = dragon_ulist_get_by_idx(dg_gateways, gw_idx, (void **) &gw_channel); - 
if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not get gateway channel."); + append_err_return(err, "Could not get a gateway channel."); ch_sh->_gw._idx = *((dragonC_UID_t*)gw_channel->header.c_uid); + ch_sh->_gw._rt_idx = dragon_get_local_rt_uid(); } /* TODO: at the moment, there are no data structures associated with the @@ -3095,24 +3195,16 @@ dragon_channel_recvh(const dragonChannelDescr_t* ch, dragonChannelRecvh_t* ch_rh if (dragon_channel_is_local(ch)) { ch_rh->_gw._idx = 0; + ch_rh->_gw._rt_idx = 0; } else { dragonChannel_t* gw_channel; - if (dg_gateways == NULL) - err_return(DRAGON_CHANNEL_NO_GATEWAYS, "There are no registered gateway channels and the channel " - "is not local."); - - int gw_idx; - - err = _get_gw_idx(&ch_rh->_ch, DRAGON_OP_TYPE_GET_MSG, &gw_idx); - if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not get a gateway index."); - - err = dragon_ulist_get_by_idx(dg_gateways, gw_idx, (void **) &gw_channel); + err = _get_gateway(&ch_rh->_ch, DRAGON_OP_TYPE_GET_MSG, &gw_channel); if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not get gateway channel."); + append_err_return(err, "Could not get a gateway channel."); ch_rh->_gw._idx = *((dragonC_UID_t*)gw_channel->header.c_uid); + ch_rh->_gw._rt_idx = dragon_get_local_rt_uid(); } /* TODO: at the moment, there are no data structures associated with the @@ -3409,9 +3501,9 @@ dragon_chsend_send_msg(const dragonChannelSendh_t* ch_sh, const dragonMessage_t* dragonMemoryDescr_t req_mem; err = _channel_from_descr(&ch_sh->_gw, &gw_channel); - if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not resolved gateway channel " - "descriptor while sending a message."); + if (err != DRAGON_SUCCESS) { + append_err_return(err, "Could not resolve gateway channel descriptor while sending a message."); + } /* it is not a local channel so we interact with the gateway channel * instead */ @@ -3582,8 +3674,9 @@ dragon_chrecv_get_msg_blocking(const dragonChannelRecvh_t* ch_rh, dragonMessage_ // providing the NOTIMEOUT constant means providing a NULL pointer to the // bcast in the end_time_ptr pointer below. 
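Aside (illustrative, not part of the patch): the send- and receive-handle hunks above collapse the old two-step gateway lookup (_get_gw_idx() followed by dragon_ulist_get_by_idx()) into a single _get_gateway() call, and stamp the handle with both the gateway's channel uid and the local runtime uid. A minimal restatement of the new non-local branch, using only names that appear in this patch (send case shown; the receive case is identical with DRAGON_OP_TYPE_GET_MSG):

    dragonChannel_t* gw_channel;

    /* one helper call now covers gateway selection and lookup */
    err = _get_gateway(&ch_sh->_ch, DRAGON_OP_TYPE_SEND_MSG, &gw_channel);
    if (err != DRAGON_SUCCESS)
        append_err_return(err, "Could not get a gateway channel.");

    /* the handle records the gateway as a (runtime uid, channel uid) pair */
    ch_sh->_gw._idx    = *((dragonC_UID_t*) gw_channel->header.c_uid);
    ch_sh->_gw._rt_idx = dragon_get_local_rt_uid();
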
if (timer->tv_nsec == DRAGON_CHANNEL_BLOCKING_NOTIMEOUT.tv_nsec && - timer->tv_sec == DRAGON_CHANNEL_BLOCKING_NOTIMEOUT.tv_sec) + timer->tv_sec == DRAGON_CHANNEL_BLOCKING_NOTIMEOUT.tv_sec) { timer = NULL; + } timespec_t* end_time_ptr = NULL; timespec_t* remaining_time_ptr = NULL; @@ -3746,7 +3839,7 @@ dragon_chrecv_get_msg_blocking(const dragonChannelRecvh_t* ch_rh, dragonMessage_ err = dragon_channel_gatewaymessage_client_get_cmplt(&gw_msg, msg_recv, ch_rh->_attrs.wait_mode); if (err != DRAGON_SUCCESS) - err_return(err, "non-zero completion of remote get_msg"); + append_err_return(err, "non-zero completion of remote get_msg"); } no_err_return(DRAGON_SUCCESS); @@ -4105,15 +4198,10 @@ dragon_channel_poll(const dragonChannelDescr_t* ch, dragonWaitMode_t wait_mode, dragonGatewayMessageSerial_t gw_ser_msg; dragonMessage_t req_msg; dragonMemoryDescr_t req_mem; - int gw_idx; - err = _get_gw_idx(ch, DRAGON_OP_TYPE_POLL, &gw_idx); + err = _get_gateway(ch, DRAGON_OP_TYPE_POLL, &gw_channel); if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not get a gateway index."); - - err = dragon_ulist_get_by_idx(dg_gateways, gw_idx, (void **) &gw_channel); - if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not get gateway channel."); + append_err_return(err, "Could not get a gateway channel."); timespec_t* end_time_ptr = NULL; timespec_t* remaining_time_ptr = NULL; @@ -4165,7 +4253,9 @@ dragon_channel_poll(const dragonChannelDescr_t* ch, dragonWaitMode_t wait_mode, append_err_return(err, "Could not initialize message to send to " "transport service via gateway channel."); - err = _channel_descr_from_c_uid(*((dragonC_UID_t*)gw_channel->header.c_uid), &gw_descr); + dragonRT_UID_t rt_uid = dragon_get_local_rt_uid(); + + err = _channel_descr_from_uids(rt_uid, *((dragonC_UID_t*)gw_channel->header.c_uid), &gw_descr); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not get gateway channel descriptor."); @@ -4414,7 +4504,7 @@ dragon_channel_register_gateways_from_env() } dragonChannelSerial_t gw_ser; - gw_ser.data = dragon_base64_decode(gw_str, strlen(gw_str), &gw_ser.len); + gw_ser.data = dragon_base64_decode(gw_str, &gw_ser.len); if (gw_ser.data == NULL) { snprintf(err_str, 400, "The environment variable %s was not a valid " @@ -4517,7 +4607,7 @@ dragon_channel_register_gateway(dragonChannelDescr_t* ch) dragon_generate_uuid(channel->proc_gw_sendhid); - err = _register_gateway(channel); + err = _register_gateway(channel, &dg_gateways); if (err != DRAGON_SUCCESS) append_err_return(err, "There was an error registering this channel as a gateway."); @@ -4558,7 +4648,7 @@ dragon_channel_unregister_gateway(dragonChannelDescr_t* ch) err_return(DRAGON_INVALID_ARGUMENT, "Cannot unregister non-local channel as gateway. 
This " "shouldn't have happened, ever."); - err = _unregister_gateway(channel); + err = _unregister_gateway(channel, dg_gateways); if (err != DRAGON_SUCCESS) append_err_return(err, "Cannot unregister channel as a gateway due to some unknown error."); diff --git a/src/lib/channels_messages.c b/src/lib/channels_messages.c index 2b782b3..ee731b7 100644 --- a/src/lib/channels_messages.c +++ b/src/lib/channels_messages.c @@ -921,6 +921,7 @@ dragon_channel_gatewaymessage_send_create(dragonMemoryPoolDescr_t * pool_descr, err = _assign_gateway_message_header_send(gmsg, target_hostid, deadline, target_ch_ser.len, cleanup_payload_required, msg_nbytes, msg_ser_nbytes, dest_mem_ser_nbytes, return_mode, send_attr->sendhid, mattr.clientid, mattr.hints); + if (err != DRAGON_SUCCESS) { append_err_noreturn("Could not assign values into gateway message header."); goto gwmsg_alloc_fail; @@ -1616,6 +1617,7 @@ dragon_channel_gatewaymessage_transport_get_cmplt(dragonGatewayMessage_t * gmsg, dragonError_t err; dragonMemoryDescr_t msg_mem; dragonMemorySerial_t msg_mem_ser; + dragonMessageAttr_t msg_attrs; if (gmsg == NULL) err_return(DRAGON_INVALID_ARGUMENT, "GatewayMessage cannot be NULL."); @@ -1678,10 +1680,16 @@ dragon_channel_gatewaymessage_transport_get_cmplt(dragonGatewayMessage_t * gmsg, if (err != DRAGON_SUCCESS) append_err_return(err, "Unable to serialize message memory for transport get complete operation."); + err = dragon_channel_message_getattr(msg_recv, &msg_attrs); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Unable to retrieve attributes from message."); + void * obj_ptr = gmsg->_obj_ptr + *(gmsg->_header.dest_mem_descr_ser_offset); memcpy(obj_ptr, msg_mem_ser.data, msg_mem_ser.len); *(gmsg->_header.dest_mem_descr_ser_nbytes) = msg_mem_ser.len; + *(gmsg->_header.send_hints) = msg_attrs.hints; + *(gmsg->_header.send_clientid) = msg_attrs.clientid; } err = dragon_bcast_trigger_all(&gmsg->_cmplt_bcast, NULL, NULL, 0); @@ -1728,6 +1736,7 @@ dragon_channel_gatewaymessage_client_get_cmplt(dragonGatewayMessage_t * gmsg, dr dragonMemoryDescr_t msg_mem; dragonMemorySerial_t msg_mem_ser; dragonError_t get_rc; + dragonMessageAttr_t msg_attrs; if (gmsg == NULL) err_return(DRAGON_INVALID_ARGUMENT, "The gateway message cannot be NULL"); @@ -1756,7 +1765,17 @@ dragon_channel_gatewaymessage_client_get_cmplt(dragonGatewayMessage_t * gmsg, dr if (err != DRAGON_SUCCESS) append_err_return(err, "Could not attach serialized message memory in client get complete gateway operation."); - err = dragon_channel_message_init(msg_recv, &msg_mem, NULL); + err = dragon_channel_message_attr_init(&msg_attrs); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not initialize message attributes in get completion of gateway receive."); + + size_t sz; + dragon_memory_get_size(&msg_mem, &sz); + + msg_attrs.hints = *(gmsg->_header.send_hints); + msg_attrs.clientid = *(gmsg->_header.send_clientid); + + err = dragon_channel_message_init(msg_recv, &msg_mem, &msg_attrs); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not initialize message in get completion of gateway receive."); } else { diff --git a/src/lib/channelsets.c b/src/lib/channelsets.c index 2455dc8..1ab3cb3 100644 --- a/src/lib/channelsets.c +++ b/src/lib/channelsets.c @@ -303,6 +303,15 @@ dragon_channelset_destroy(dragonChannelSetDescr_t * chset_descr) if (err != DRAGON_SUCCESS) append_err_return(err, "Cannot get channelset from descriptor."); + if (!chset->first_poll_call) { + int perr = pthread_join(chset->tid, NULL); + if (perr != 0) { + char 
err_str[80]; + snprintf(err_str, 80, "There was an error on the pthread_join call. ERR=%d", perr); + err_return(DRAGON_FAILURE, err_str); + } + } + for (int k=0;knum_channels;k++) { dragon_channel_remove_event_bcast(&chset->channels[k].descr, chset->channels[k].token); } @@ -485,11 +494,9 @@ dragonError_t dragon_channelset_poll(dragonChannelSetDescr_t * chset_descr, dragonWaitMode_t wait_mode, timespec_t * timeout, dragonReleaseFun release_fun, void* release_arg, dragonChannelSetEventNotification_t ** event) { - pthread_t tid; pthread_attr_t attr; pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); size_t payload_sz; dragonChannelSet_t * chset; @@ -503,7 +510,7 @@ dragon_channelset_poll(dragonChannelSetDescr_t * chset_descr, dragonWaitMode_t w if (chset->first_poll_call) { chset->first_poll_call = false; - int perr = pthread_create(&tid, &attr, _channelset_sync, (void*)chset); + int perr = pthread_create(&chset->tid, &attr, _channelset_sync, (void*)chset); pthread_attr_destroy(&attr); if (perr != 0) { @@ -668,4 +675,4 @@ dragon_channelset_reset(dragonChannelSetDescr_t* chset_descr) append_err_return(err, "Cannot reset event monitor bcast."); return DRAGON_SUCCESS; -} \ No newline at end of file +} diff --git a/src/lib/ddict.cpp b/src/lib/ddict.cpp new file mode 100644 index 0000000..a3fab4e --- /dev/null +++ b/src/lib/ddict.cpp @@ -0,0 +1,731 @@ +#include +#include "_ddict.h" +#include +#include +#include "err.h" + + +static dragonMap_t * dg_ddict_adapters = NULL; +static dragonMap_t * dg_ddict_reqs = NULL; + +static uint64_t tag = 42; /* Tag is not needed in the messages, but is there for + compatibility should multiple messages need to be + handled simultaneously at some future point. */ + +static dragonError_t +_send_receive(dragonFLIDescr_t* sendto_fli, DragonMsg* send_msg, dragonFLIDescr_t* recvfrom_fli, + DragonResponseMsg** recv_msg, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLISendHandleDescr_t sendh; + dragonFLIRecvHandleDescr_t recvh; + + err = dragon_fli_open_send_handle(sendto_fli, &sendh, NULL, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not open send handle."); + + err = dragon_fli_open_recv_handle(recvfrom_fli, &recvh, NULL, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not open recv handle."); + + err = send_msg->send(&sendh, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not send message."); + + err = dragon_fli_close_send_handle(&sendh, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not close send handle."); + + err = recv_fli_msg(&recvh, (DragonMsg**)recv_msg, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not receive response message."); + + err = dragon_fli_close_recv_handle(&recvh, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not close receive handle."); + + no_err_return(DRAGON_SUCCESS); +} + +static dragonError_t +_add_umap_ddict_entry(dragonDDictDescr_t * ddict, dragonDDict_t * new_ddict) +{ + dragonError_t err; + + if (dg_ddict_adapters == NULL) { + dg_ddict_adapters = (dragonMap_t*)malloc(sizeof(dragonMap_t)); + if (dg_ddict_adapters == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "cannot allocate umap for ddict"); + + err = dragon_umap_create(dg_ddict_adapters, DRAGON_DDICT_UMAP_SEED); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to create umap for ddict"); + } + + err = dragon_umap_additem_genkey(dg_ddict_adapters, new_ddict, &ddict->_idx); 
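Aside (illustrative, not part of the patch): every distributed-dictionary client request in this new file funnels through the _send_receive() helper defined above: open the send and receive FLI handles, send one message, close the send side, then block on the single response. A sketch of one round trip, written the way the attach path further below drives it; error handling is abbreviated and all message and field names are the ones introduced in this patch:

    DragonResponseMsg* resp_msg = NULL;
    DDRegisterClientMsg registerClient(tag, ddict->respFLIStr, ddict->bufferedRespFLIStr);

    /* one request/response exchange with the orchestrator over its FLI */
    dragonError_t err = _send_receive(&ddict->orchestrator_fli, &registerClient,
                                      &ddict->bufferedRespFLI, &resp_msg, timeout);
    if (err != DRAGON_SUCCESS)
        append_err_return(err, "Could not send the register client message and receive response.");

    if (resp_msg->tc() != DDRegisterClientResponseMsg::TC)
        err_return(DRAGON_FAILURE, "Did not get expected register client response message.");

    /* response messages are heap objects owned by the caller */
    delete resp_msg;
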
+ if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to insert item into ddict umap"); + + new_ddict->dd_uid = ddict->_idx; + + no_err_return(DRAGON_SUCCESS); +} + +static dragonError_t +_ddict_from_descr(const dragonDDictDescr_t * dd_descr, dragonDDict_t ** ddict) +{ + if (dd_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "invalid ddict descriptor"); + + dragonError_t err = dragon_umap_getitem(dg_ddict_adapters, dd_descr->_idx, (void**)ddict); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to find item in ddict umap"); + + no_err_return(DRAGON_SUCCESS); +} + +static dragonError_t +_add_umap_ddict_req_entry(const dragonDDictRequestDescr_t * descr, const dragonDDictReq_t * new_req) +{ + // dragonError_t err; + + // if (dg_ddict_reqs == NULL) { + // dg_ddict_reqs = malloc(sizeof(dragonMap_t)); + // if (dg_ddict_reqs == NULL) + // err_return(DRAGON_INTERNAL_MALLOC_FAIL, "cannot allocate umap for ddict requests"); + + // err = dragon_umap_create(dg_ddict_reqs, DRAGON_DDICT_UMAP_SEED); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "failed to create umap for ddict requests"); + // } + + // err = dragon_umap_additem(dg_ddict_reqs, descr->_idx, new_req); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "failed to insert item into ddict request umap"); + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +static dragonError_t +_ddict_req_from_descr(const dragonDDictRequestDescr_t * req_descr, dragonDDictReq_t ** req) +{ + // if (req_descr == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "invalid ddict request descriptor"); + + // dragonError_t err = dragon_umap_getitem(dg_ddict_reqs, req_descr->_idx, (void*)req); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "failed to find item in ddict request umap"); + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; + +} + +static dragonError_t +_send_buffered_bytes(dragonDDict_t * ddict) +{ + // dragonMemoryDescr_t mem_descr; + // void * mem_ptr; + // void * dest_ptr; + // dragonDDictBufAlloc_t * node; + // dragonDDictBufAlloc_t * prev; + + // // TODO: DDict needs a way to + // dragonError_t err = dragon_memory_alloc(&mem_descr, NULL, ddict->total_send_buffer); + + return DRAGON_NOT_IMPLEMENTED; +} + +static dragonError_t +_buffer_bytes(dragonDDictReq_t * req, uint8_t * bytes, size_t num_bytes) +{ + // void * data_ptr; + // dragonDDictBufAlloc_t * node_ptr; + + // if (num_bytes > 0) { + // data_ptr = malloc(num_bytes); + // if (data_ptr == NULL) + // err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate buffer space -- OOM"); + + // node_ptr = malloc(sizeof(dragonDDictBufAlloc_t)); + // if (node_ptr == NULL) + // err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate node pointer -- OOM"); + + // memcpy(data_ptr, bytes, num_bytes); + + // node_ptr->data = data_ptr; + // node_ptr->num_bytes = num_bytes; + // req->buffer_size += num_bytes; + // node_ptr->next = req->buffered_allocs; + // req->buffered_allocs = node_ptr; + // } + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; + +} + +static dragonError_t +_send_key(dragonDDictReq_t * req) +{ + // Buffer key into one blob and store it, then send it + // dragonDDictBufAlloc_t * node = req->buffered_allocs; + // size_t key_size = req->buffer_size; + // req->buffered_allocs = NULL; + + // // We have to order this backwards, so jump to the end of the first memcpy start + // void * data = (void*)malloc(key_size); + // void * dst_ptr = data + (key_size - node->num_bytes); + // 
while (node != NULL) { + // memcpy(dst_ptr, node->data, node->num_bytes); + // free(node->data); + // dragonDDictBufAlloc_t * tmp = node->next; + // free(node); + // node = tmp; + // if (node != NULL) { + // dst_ptr = dst_ptr - node->num_bytes; + // } + // } + + // req->key_data = data; + // // Once our key is constructed, hash it + // req->key_hash = dragon_hash(data, key_size); + // dragonDDictDescr_t descr; + // descr._idx = req->dd_uid; + // dragonDDict_t * ddict; + // dragonError_t err = _ddict_from_descr(&descr, &ddict); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to retrieve ddict"); + // dragonFLIDescr_t fli = ddict->manager_flis[req->key_hash % ddict->num_managers]; + + // err = dragon_fli_open_send_handle(&fli, &req->sendh, NULL, NULL); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to open send handle"); + + // uint64_t unused_arg; + // err = dragon_fli_send_bytes(&req->sendh, key_size, req->key_data, &unused_arg, false, NULL); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to send key to manager FLI"); + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_serialize(const dragonDDictDescr_t * dd, dragonDDictSerial_t * dd_ser) +{ + // if (dd == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "invalid ddict descriptor"); + + // if (dd_ser == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "invalid ddict serial descriptor"); + + // dd_ser->len = 0; + // dd_ser->data = NULL; + + // dragonDDict_t * ddict; + // dragonError_t err = _ddict_from_descr(dd, &ddict); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "invalid ddict descriptor"); + + // dd_ser->len = ddict->ser.len // actually FLI serial data + // dd_ser->data = malloc(dd_ser->len); + // if (dd_ser->data == NULL) + // err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for serialized descriptor."); + + // memcpy(dd_ser->data, ddict->ser.data, ddict->ser.len); + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_serial_free(dragonDDictSerial_t * dd_ser) +{ + // if (dd_ser == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "Invalid ddict serial descriptor"); + + // if (dd_ser->data != NULL) + // free(dd_ser->data); + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_attach_b64(char* b64_str, dragonDDictDescr_t* obj, const timespec_t* timeout) +{ + // dragonDDictSerial_t serial; + // dragonError_t err; + + // serial.data = dragon_base64_decode(b64_str, &serial.len); + // err = dragon_ddict_attach(&serial, obj, timeout); + + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Could not attach to distributed dictionary."); + + // err = dragon_ddict_serial_free(&serial); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Could not free the serialized descriptor."); + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_attach(const dragonDDictSerial_t * dd_ser, dragonDDictDescr_t * dd, const timespec_t * timeout) +{ + dragonError_t err; + dragonChannelDescr_t resp_ch; + dragonChannelDescr_t buffered_resp_ch; + dragonFLISerial_t ser_mgr_fli; + dragonFLISerial_t ser_resp_fli; + dragonFLISerial_t ser_buffered_resp_fli; + dragonFLISerial_t fli_ser; + DragonResponseMsg* resp_msg; + DDRegisterClientResponseMsg* registerResponse; + + if (dd == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "invalid ddict descriptor"); + + if (dd_ser == 
NULL) + err_return(DRAGON_INVALID_ARGUMENT, "invalid serialized ddict descriptor"); + + if (dd_ser->data == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "invalid serialized ddict descriptor"); + + // attach + dragonDDict_t* ddict = (dragonDDict_t*)malloc(sizeof(dragonDDict_t)); + if (ddict == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "unable to allocate new ddict structure"); + + // Attach to primary FLI adapter + fli_ser.data = dd_ser->data; + fli_ser.len = dd_ser->len; + err = dragon_fli_attach(&fli_ser, NULL, &(ddict->orchestrator_fli)); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to attach to FLI adapter"); + + // Copy in the serialized descriptor. + ddict->ser.len = dd_ser->len; + ddict->ser.data = (uint8_t*) malloc(dd_ser->len); + if (ddict->ser.data == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for internal serialized descriptor"); + memcpy(ddict->ser.data, dd_ser->data, dd_ser->len); + + /* Get the return channels for the client. This includes a streaming and a buffered return channel. + Streaming requires one extra message per conversation but allow value data to be streamed back + to the client. */ + + err = dragon_create_process_local_channel(&resp_ch, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create response channel."); + + err = dragon_create_process_local_channel(&buffered_resp_ch, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create buffered response channel."); + + err = dragon_fli_create(&ddict->respFLI, &resp_ch, NULL, NULL, 0, NULL, false, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create response fli."); + + err = dragon_fli_create(&ddict->bufferedRespFLI, &buffered_resp_ch, NULL, NULL, 0, NULL, true, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create buffered response fli."); + + err = dragon_fli_serialize(&ddict->respFLI, &ser_resp_fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not serialize respFLI."); + + err = dragon_fli_serialize(&ddict->bufferedRespFLI, &ser_buffered_resp_fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not serialize bufferedRespFLI."); + + ddict->respFLIStr = dragon_base64_encode(ser_resp_fli.data, ser_resp_fli.len); + ddict->bufferedRespFLIStr = dragon_base64_encode(ser_buffered_resp_fli.data, ser_buffered_resp_fli.len); + + err = dragon_fli_serial_free(&ser_resp_fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free serialized response fli."); + + err = dragon_fli_serial_free(&ser_buffered_resp_fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free serialized buffered response fli."); + + DDRegisterClientMsg registerClient(tag, ddict->respFLIStr, ddict->bufferedRespFLIStr); + + err = _send_receive(&ddict->orchestrator_fli, ®isterClient, &ddict->bufferedRespFLI, &resp_msg, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not send the register client message and receive response."); + + if (resp_msg->tc() != DDRegisterClientResponseMsg::TC) + err_return(DRAGON_FAILURE, "Did not get expected register client response message."); + + registerResponse = (DDRegisterClientResponseMsg*) resp_msg; + + if (registerResponse->err() != DRAGON_SUCCESS) + err_return(registerResponse->err(), registerResponse->errInfo()); + + ddict->clientID = registerResponse->clientID(); + ddict->num_managers = registerResponse->numManagers(); + + // ddict->manager_flis = (dragonFLIDescr_t*) 
malloc(sizeof(dragonFLIDescr_t) * ddict->num_managers); + // if (ddict->manager_flis != NULL) + // err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for manager flis."); + + // for (size_t k=0;knum_managers;k++) { + // ser_mgr_fli.data = dragon_base64_decode(registerResponse->managerFLI(k), &ser_mgr_fli.len); + + // err = dragon_fli_attach(&ser_mgr_fli, NULL, &ddict->manager_flis[k]); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Could not attach to manager fli."); + + // err = dragon_fli_serial_free(&ser_mgr_fli); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Could not free serialized fli for the manager."); + // } + + registerResponse = NULL; + delete resp_msg; + + /* TODO: We could eventually send all these requests and THEN wait for all the responses. + Without hundreds of managers this shouldn't be a big deal. Once hundreds are needed, we + should revisit this. */ + for (uint64_t k=0;knum_managers;k++) { + DDRegisterClientIDMsg registerWithManager(tag, ddict->clientID, ddict->respFLIStr, ddict->bufferedRespFLIStr); + + err = _send_receive(&ddict->manager_flis[k], ®isterWithManager, &ddict->bufferedRespFLI, &resp_msg, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not send the register client message and receive response."); + + if (resp_msg->tc() != DDRegisterClientIDResponseMsg::TC) + err_return(DRAGON_FAILURE, "Did not get correct response message."); + + if (resp_msg->err() != DRAGON_SUCCESS) + err_return(resp_msg->err(), resp_msg->errInfo()); + + delete resp_msg; + } + + err = _add_umap_ddict_entry(dd, ddict); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to add new ddict entry to umap"); + + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_detach(dragonDDictDescr_t * descr, const timespec_t * timeout) +{ + /* TODO: Deregister the client and DETACH FROM ALL FLIS */ + dragonDDict_t * ddict; + dragonError_t err = _ddict_from_descr(descr, &ddict); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid ddict descriptor"); + + // Detach from FLI + err = dragon_fli_detach(&(ddict->respFLI)); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to detach from FLI adapter"); + + // Free any local resources on the handle + + return DRAGON_NOT_IMPLEMENTED; +} + +/* + General procedure: + - Create request + - Use write_bytes to write key (always happens) + - Call operator (put, get, contains) + - This should send a signal to the orchestrator + - This should also send the buffered key data + - Call appropriate following calls (write_bytes, read_bytes, read_mem) + - Finalize request, sending appropriate signal to orchestrator we are done + - Wait on success/fail reply +*/ + +dragonError_t +dragon_ddict_create_request(dragonDDictDescr_t * descr, dragonDDictRequestDescr_t * req_descr) +{ + // // Generate a request to do read/write operations to the dictionary + // // Once accepted, perform operations, then finalize request + + // if (descr == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "Invalid ddict descriptor"); + + // if (req_descr == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "Invalid ddict request descriptor"); + + // // Validate ddict exists + // dragonDDict_t * ddict; + // dragonError_t err = _ddict_from_descr(descr, &ddict); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "invalid ddict descriptor"); + + // dragonDDictReq_t * req; + // // Check if req exists in umap + // err = _ddict_req_from_descr(req_descr, &req); + // if (err == DRAGON_SUCCESS) + // 
err_return(DRAGON_INVALID_ARGUMENT, "Request already exists, cannot overwrite"); + + // // If not, add to umap + // req = malloc(sizeof(dragonDDictReq_t)); + // err = _add_umap_ddict_req_entry(req_descr, req); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to add new request entry"); + + // req->dd_uid = ddict->dd_uid; + // req->key_data = NULL; + // req->key_hash = 0UL; + // req->op_type = DRAGON_DDICT_NO_OP; + + // // TODO: Ping the orchestrator with a message confirming our request + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_finalize_request(dragonDDictRequestDescr_t * req_descr, const timespec_t * timeout) +{ + // // Finalize a request to do read/write operations + // // This lets the orchestrator know the client is done for now + // if (req_descr == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "Invalid request descriptor"); + + // dragonDDictReq_t * req; + // dragonError_t err = _ddict_req_from_descr(req_descr, &req); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to find request object"); + + // switch(req->op_type) { + // case DRAGON_DDICT_NO_OP: + // case DRAGON_DDICT_FINALIZED: + // err_return(DRAGON_INVALID_OPERATION, "Request is invalid"); + // break; + + // case DRAGON_DDICT_CONTAINS_REQ: + // { + // //Error check to see if key is present? + // err = _send_key(req); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to send key to manager"); + + // // auto send_msg = msgDDictContains() + // // err = dragon_fli_send_bytes(&req->sendh, send_msg.size, send_msg.bytes, arg, false, NULL); + // // err = dragon_fli_close_send_handle(&req->sendh, NULL); + + // } + // break; + + // case DRAGON_DDICT_GET_REQ: + // { + // // auto send_msg = msgDDictFinishRead + // // err = dragon_fli_send_bytes(&req->sendh, send_msg.size, send_msg.bytes, arg, false, NULL); + // // err = dragon_fli_close_send_handle(&req->sendh, NULL); + // } + // break; + + // case DRAGON_DDICT_PUT_REQ: + // { + // // auto send_msg = msgDDictFinishWrite + // // err = dragon_fli_send_bytes(&req->sendh, send_msg.size, send_msg.bytes, arg, false, NULL); + // // err = dragon_fli_close_send_handle(&req->sendh, NULL); + // } + // break; + + // default: + // err_return(DRAGON_INVALID_ARGUMENT, "Unimplemented or invalid operator type"); + // } + + // // Get response message + // size_t recv_sz; + // char * recv_bytes; + // uint64_t arg; + // err = dragon_fli_recv_bytes(&req->recvh, 0, &recv_sz, &recv_bytes, &arg, NULL); + // // if (err != DRAGON_SUCCESS) + + // // Check response (arg value?) 
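Aside (illustrative, not part of the patch): taken together with the "General procedure" comment above, the intended client-side flow for a put is roughly the sketch below. This is a composition of the stubs in this file, most of which still return DRAGON_NOT_IMPLEMENTED; dd, key_bytes/key_len, and val_bytes/val_len are placeholders, timeouts are passed as NULL, and per-call error checks are omitted for brevity:

    dragonDDictRequestDescr_t req;
    dragonError_t err;

    err = dragon_ddict_create_request(&dd, &req);
    /* the first write_bytes call(s) are buffered and treated as the key */
    err = dragon_ddict_write_bytes(&req, key_len, key_bytes, NULL);
    /* the operator hashes the buffered key and sends it to the chosen manager */
    err = dragon_ddict_put(&req, NULL);
    /* further write_bytes calls stream the value */
    err = dragon_ddict_write_bytes(&req, val_len, val_bytes, NULL);
    /* finalize signals completion and collects the manager's response */
    err = dragon_ddict_finalize_request(&req, NULL);
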
+ + // req->op_type = DRAGON_DDICT_FINALIZED; + // free(req->key_data); // Free malloc'd key + // dragon_umap_delitem(dg_ddict_reqs, req_descr->_idx); // Remove request from umap + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_write_bytes(dragonDDictRequestDescr_t * req_descr, size_t num_bytes, uint8_t * bytes, const timespec_t * timeout) +{ + // if (req_descr == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "invalid request descriptor"); + + // dragonDDictReq_t * req; + // dragonError_t err = _ddict_req_from_descr(req_descr, &req); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to find request object"); + + // // Buffer key writes + // if (req->key_data == NULL) { + // err = _buffer_bytes(req, bytes, num_bytes); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to buffer key message"); + + // } else { + // if (req->op_type != DRAGON_DDICT_PUT_REQ) + // err_return(DRAGON_INVALID_OPERATION, "Trying to perform a write operation with a non-write request"); + + // // Write data out normally, key is done + // uint64_t arg; + // err = dragon_fli_send_bytes(&req->sendh, num_bytes, bytes, &arg, false, timeout); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to write bytes to ddict"); + // } + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_read_bytes(dragonDDictRequestDescr_t* req_descr, size_t requested_size, + size_t* received_size, uint8_t** bytes, const timespec_t* timeout) +{ + if (req_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid request descriptor"); + + dragonDDictReq_t * req; + dragonError_t err = _ddict_req_from_descr(req_descr, &req); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to find request object"); + + // Operation type is set in the put/get/contains/etc call after setting the key + if (req->op_type != DRAGON_DDICT_GET_REQ) + err_return(DRAGON_INVALID_OPERATION, "Invalid operation type"); + + uint64_t arg; + err = dragon_fli_recv_bytes(&req->recvh, requested_size, received_size, bytes, &arg, timeout); + if (err != DRAGON_SUCCESS) { + if (err == DRAGON_EOT) + return err; + + append_err_return(err, "Failed to read bytes from dictionary"); + } + + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_ddict_read_bytes_into(dragonDDictRequestDescr_t* req, size_t requested_size, + size_t* received_size, uint8_t* bytes, const timespec_t* timeout) +{ + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_read_mem(dragonDDictRequestDescr_t* req_descr, dragonMemoryDescr_t* mem_descr) +{ + // if (req_descr == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "Invalid request descriptor"); + + // if (mem_descr == NULL) + // err_return(DRAGON_INVALID_ARGUMENT, "Invalid memory descriptor"); + + // dragonDDictReq_t * req; + // dragonError_t err = _ddict_req_from_descr(req_descr, &req); + // if (err != DRAGON_SUCCESS) + // append_err_return(err, "Failed to find request object"); + + // if (req->op_type != DRAGON_DDICT_GET_REQ) + // err_return(DRAGON_INVALID_OPERATION, "Invalid operation type"); + + // uint64_t arg; + // err = dragon_fli_recv_mem(&req->recvh, mem_descr, &arg, NULL); + // if (err != DRAGON_SUCCESS) { + // if (err == DRAGON_EOT) + // return err; + + // append_err_return(err, "Failed to receive into memory descriptor"); + // } + + // no_err_return(DRAGON_SUCCESS); + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_contains(dragonDDictRequestDescr_t * 
req_descr) +{ + // Check if provided key exists in the ddict + if (req_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid request descriptor"); + + dragonDDictReq_t * req; + dragonError_t err = _ddict_req_from_descr(req_descr, &req); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to find request object"); + + // key_data means we've already written and sent it, out-of-order operation + if (req->key_data != NULL) + err_return(DRAGON_INVALID_OPERATION, "Key has already been sent, invalid operation order"); + + req->op_type = DRAGON_DDICT_CONTAINS_REQ; // So we know what to finalize + + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_get(dragonDDictRequestDescr_t* req_descr, const timespec_t* timeout) +{ + if (req_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid request descriptor"); + + dragonDDictReq_t * req; + dragonError_t err = _ddict_req_from_descr(req_descr, &req); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not find valid request object"); + + if (req->key_data == NULL && req->buffered_allocs == NULL) + err_return(DRAGON_INVALID_OPERATION, "No data present in request"); + + if (req->key_data == NULL && req->buffered_allocs != NULL) { + err = _send_key(req); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to send key"); + } else { + err_return(DRAGON_INVALID_OPERATION, "What goes here?"); + } + + req->op_type = DRAGON_DDICT_GET_REQ; + + return DRAGON_NOT_IMPLEMENTED; +} + +dragonError_t +dragon_ddict_put(dragonDDictRequestDescr_t* req_descr, const timespec_t* timeout) +{ + if (req_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid request descriptor"); + + dragonDDictReq_t * req; + dragonError_t err = _ddict_req_from_descr(req_descr, &req); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not find valid request object"); + + if (req->key_data == NULL && req->buffered_allocs == NULL) + err_return(DRAGON_INVALID_OPERATION, "No data present in request"); + + if (req->key_data == NULL && req->buffered_allocs != NULL) { + err = _send_key(req); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to send key"); + } else { + err_return(DRAGON_INVALID_OPERATION, "What goes here?"); + } + + req->op_type = DRAGON_DDICT_PUT_REQ; + + return DRAGON_NOT_IMPLEMENTED; +} \ No newline at end of file diff --git a/src/lib/dictionary.cpp b/src/lib/dictionary.cpp new file mode 100644 index 0000000..3d18f2d --- /dev/null +++ b/src/lib/dictionary.cpp @@ -0,0 +1,47 @@ +#include +#include + +class SerializableInt : public DDictSerializable { + public: + SerializableInt(); + SerializableInt(int x); + virtual void serialize(dragonDDictRequestDescr_t* req, const timespec_t* timeout); + virtual void deserialize(dragonDDictRequestDescr_t* req, const timespec_t* timeout); + int getVal() const; + private: + int val; +}; + +SerializableInt::SerializableInt(): val(0) {} +SerializableInt::SerializableInt(int x): val(x) {} + +void SerializableInt::serialize(dragonDDictRequestDescr_t* req, const timespec_t* timeout) +{ + dragonError_t err; + err = dragon_ddict_write_bytes(req, sizeof(int), (uint8_t*)&val, timeout); + if (err != DRAGON_SUCCESS) + throw DragonError(err, dragon_getlasterrstr()); +} + +void SerializableInt::deserialize(dragonDDictRequestDescr_t* req, const timespec_t* timeout) +{ + dragonError_t err; + size_t actual_size; + + err = dragon_ddict_read_bytes_into(req, sizeof(int), &actual_size, (uint8_t*)&val, timeout); + if (err != DRAGON_SUCCESS) + throw DragonError(err, 
dragon_getlasterrstr()); + if (actual_size != sizeof(int)) + throw DragonError(DRAGON_INVALID_ARGUMENT, "The size of the integer was not correct."); +} + +int SerializableInt::getVal() const {return val;} + +void testit(char* ser) { + SerializableInt x(6); + SerializableInt y(42); + DDict test(ser, NULL); + test[x] = y; + SerializableInt z = test[x]; + printf("%d\n", z.getVal()); +} \ No newline at end of file diff --git a/src/lib/fli.c b/src/lib/fli.c index 99b03c9..f5cad8e 100644 --- a/src/lib/fli.c +++ b/src/lib/fli.c @@ -2,6 +2,7 @@ #include "_fli.h" #include "err.h" #include "umap.h" + #include #include #include @@ -33,9 +34,7 @@ typedef struct _ReceiverArg_st { } _ReceiverArg_t; /* obtain an fli structure from a given adapter descriptor */ -static dragonError_t -_fli_from_descr(const dragonFLIDescr_t* adapter, dragonFLI_t** fli) -{ +static dragonError_t _fli_from_descr(const dragonFLIDescr_t* adapter, dragonFLI_t** fli) { if (adapter == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); @@ -48,9 +47,7 @@ _fli_from_descr(const dragonFLIDescr_t* adapter, dragonFLI_t** fli) } /* obtain an fli structure from a given send handle descriptor */ -static dragonError_t -_fli_sendh_from_descr(const dragonFLISendHandleDescr_t* send_descr, dragonFLISendHandle_t** send_handle) -{ +static dragonError_t _fli_sendh_from_descr(const dragonFLISendHandleDescr_t* send_descr, dragonFLISendHandle_t** send_handle) { if (send_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); @@ -63,9 +60,7 @@ _fli_sendh_from_descr(const dragonFLISendHandleDescr_t* send_descr, dragonFLISen } /* obtain an fli structure from a given recv handle descriptor */ -static dragonError_t -_fli_recvh_from_descr(const dragonFLIRecvHandleDescr_t* recv_descr, dragonFLIRecvHandle_t** recv_handle) -{ +static dragonError_t _fli_recvh_from_descr(const dragonFLIRecvHandleDescr_t* recv_descr, dragonFLIRecvHandle_t** recv_handle) { if (recv_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli recv handle descriptor"); @@ -78,9 +73,7 @@ _fli_recvh_from_descr(const dragonFLIRecvHandleDescr_t* recv_descr, dragonFLIRec } /* insert an fli structure into the unordered map using the adapter->_idx as the key */ -static dragonError_t -_add_umap_fli_entry(dragonFLIDescr_t* adapter, const dragonFLI_t* fli) -{ +static dragonError_t _add_umap_fli_entry(dragonFLIDescr_t* adapter, const dragonFLI_t* fli) { dragonError_t err; /* register this channel in our umap */ @@ -104,9 +97,7 @@ _add_umap_fli_entry(dragonFLIDescr_t* adapter, const dragonFLI_t* fli) } /* insert an fli send handle structure into the unordered map using the send_descr->_idx as the key */ -static dragonError_t -_add_umap_fli_sendh_entry(dragonFLISendHandleDescr_t* send_descr, const dragonFLISendHandle_t* send_handle) -{ +static dragonError_t _add_umap_fli_sendh_entry(dragonFLISendHandleDescr_t* send_descr, const dragonFLISendHandle_t* send_handle) { dragonError_t err; /* register this channel in our umap */ @@ -130,9 +121,7 @@ _add_umap_fli_sendh_entry(dragonFLISendHandleDescr_t* send_descr, const dragonFL } /* insert an fli recv handle structure into the unordered map using the recv_descr->_idx as the key */ -static dragonError_t -_add_umap_fli_recvh_entry(dragonFLIRecvHandleDescr_t* recv_descr, const dragonFLIRecvHandle_t* recv_handle) -{ +static dragonError_t _add_umap_fli_recvh_entry(dragonFLIRecvHandleDescr_t* recv_descr, const dragonFLIRecvHandle_t* recv_handle) { dragonError_t err; /* register this channel in 
our umap */ @@ -156,20 +145,19 @@ _add_umap_fli_recvh_entry(dragonFLIRecvHandleDescr_t* recv_descr, const dragonFL } -static dragonError_t -_validate_attr(const dragonFLIAttr_t* attr) -{ +static dragonError_t _validate_attr(const dragonFLIAttr_t* attr) { return DRAGON_NOT_IMPLEMENTED; } -static dragonError_t -_send_mem(dragonChannelSendh_t* sendh, dragonMemoryDescr_t* mem, uint64_t arg, timespec_t* deadline) -{ +static dragonError_t _send_mem(dragonChannelSendh_t* sendh, dragonMemoryDescr_t* mem, uint64_t arg, + bool transfer_ownership, timespec_t* deadline) { + dragonError_t err; timespec_t remaining_time; timespec_t* timeout = NULL; dragonMessage_t msg; dragonMessageAttr_t msg_attrs; + dragonMemoryDescr_t* ownership = DRAGON_CHANNEL_SEND_TRANSFER_OWNERSHIP; if (sendh == NULL) err_return(DRAGON_INVALID_ARGUMENT, "You must provide a channel send handle to send a message."); @@ -189,12 +177,14 @@ _send_mem(dragonChannelSendh_t* sendh, dragonMemoryDescr_t* mem, uint64_t arg, t append_err_return(err, "Failed to init message attr structure."); msg_attrs.hints = arg; + if (!transfer_ownership) + ownership = NULL; err = dragon_channel_message_init(&msg, mem, &msg_attrs); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not initialize serialized stream channel message."); - err = dragon_chsend_send_msg(sendh, &msg, DRAGON_CHANNEL_SEND_TRANSFER_OWNERSHIP, timeout); + err = dragon_chsend_send_msg(sendh, &msg, ownership, timeout); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not add serialized stream channel to manager channel."); @@ -209,9 +199,8 @@ _send_mem(dragonChannelSendh_t* sendh, dragonMemoryDescr_t* mem, uint64_t arg, t no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_send_bytes(dragonChannelSendh_t* chan_sendh, dragonMemoryPoolDescr_t* pool, uint8_t* bytes, size_t num_bytes, uint64_t arg, timespec_t* deadline) -{ +static dragonError_t _send_bytes(dragonChannelSendh_t* chan_sendh, dragonMemoryPoolDescr_t* pool, uint8_t* bytes, + size_t num_bytes, uint64_t arg, timespec_t* deadline) { dragonError_t err; dragonMemoryDescr_t mem_descr; void* mem_ptr; @@ -222,7 +211,17 @@ _send_bytes(dragonChannelSendh_t* chan_sendh, dragonMemoryPoolDescr_t* pool, uin if (bytes == NULL && num_bytes != 0) err_return(DRAGON_INVALID_ARGUMENT, "You must provide bytes when sending a non-zero number of bytes."); - err = dragon_memory_alloc(&mem_descr, pool, num_bytes); + timespec_t* timeout = NULL; + timespec_t remaining_time; + + if (deadline != NULL) { + timeout = &remaining_time; + err = dragon_timespec_remaining(deadline, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute remaining time"); + } + + err = dragon_memory_alloc_blocking(&mem_descr, pool, num_bytes, timeout); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not get shared memory for message data."); @@ -233,16 +232,14 @@ _send_bytes(dragonChannelSendh_t* chan_sendh, dragonMemoryPoolDescr_t* pool, uin memcpy(mem_ptr, bytes, num_bytes); } - err = _send_mem(chan_sendh, &mem_descr, arg, deadline); + err = _send_mem(chan_sendh, &mem_descr, arg, true, deadline); if (err != DRAGON_SUCCESS) append_err_return(err, "Error when calling internal _send_mem."); no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_send_buffered_bytes(dragonFLISendHandle_t* sendh, timespec_t* deadline) -{ +static dragonError_t _send_buffered_bytes(dragonFLISendHandle_t* sendh, timespec_t* deadline) { dragonError_t err; dragonMemoryDescr_t mem_descr; void* mem_ptr; @@ -250,8 +247,17 @@ 
_send_buffered_bytes(dragonFLISendHandle_t* sendh, timespec_t* deadline) dragonFLISendBufAlloc_t* node; dragonFLISendBufAlloc_t* prev; + timespec_t* timeout = NULL; + timespec_t remaining_time; - err = dragon_memory_alloc(&mem_descr, &sendh->adapter->pool, sendh->total_bytes); + if (deadline != NULL) { + timeout = &remaining_time; + err = dragon_timespec_remaining(deadline, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Send buffered bytes timed out before sending."); + } + + err = dragon_memory_alloc_blocking(&mem_descr, &sendh->adapter->pool, sendh->total_bytes, timeout); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not get shared memory for message data."); @@ -267,14 +273,19 @@ _send_buffered_bytes(dragonFLISendHandle_t* sendh, timespec_t* deadline) memcpy(dest_ptr, node->data, node->num_bytes); prev = node; node = node->next; - free(prev->data); + if (prev->data != NULL) { + free(prev->data); + prev->data = NULL; + } + free(prev); + prev = NULL; } if (dest_ptr != mem_ptr) err_return(DRAGON_INVALID_OPERATION, "There was an error while unbuffering data in send operation."); - err = _send_mem(&sendh->chan_sendh, &mem_descr, sendh->buffered_arg, deadline); + err = _send_mem(&sendh->chan_sendh, &mem_descr, sendh->buffered_arg, true, deadline); if (err != DRAGON_SUCCESS) append_err_return(err, "Error when calling internal _send_mem."); @@ -285,8 +296,7 @@ _send_buffered_bytes(dragonFLISendHandle_t* sendh, timespec_t* deadline) no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_buffer_bytes(dragonFLISendHandle_t* sendh, uint8_t* bytes, size_t num_bytes, uint64_t arg) { +static dragonError_t _buffer_bytes(dragonFLISendHandle_t* sendh, uint8_t* bytes, size_t num_bytes, uint64_t arg) { void* data_ptr; dragonFLISendBufAlloc_t* node_ptr; @@ -315,12 +325,19 @@ _buffer_bytes(dragonFLISendHandle_t* sendh, uint8_t* bytes, size_t num_bytes, ui no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_recv_mem(dragonChannelRecvh_t* recvh, dragonMemoryDescr_t* mem, uint64_t* arg, timespec_t* deadline) -{ +static dragonError_t _recv_mem(dragonChannelRecvh_t* recvh, dragonMemoryDescr_t* mem, uint64_t* arg, timespec_t* deadline) { dragonError_t err; dragonMessage_t msg; dragonMessageAttr_t attrs; + timespec_t* timeout = NULL; + timespec_t remaining_time; + + if (deadline != NULL) { + timeout = &remaining_time; + err = dragon_timespec_remaining(deadline, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute remaining time"); + } if (recvh == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Must provide non-null receive handle."); @@ -335,7 +352,7 @@ _recv_mem(dragonChannelRecvh_t* recvh, dragonMemoryDescr_t* mem, uint64_t* arg, if (err != DRAGON_SUCCESS) append_err_return(err, "Could not initialize message structure."); - err = dragon_chrecv_get_msg_blocking(recvh, &msg, deadline); + err = dragon_chrecv_get_msg_blocking(recvh, &msg, timeout); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not receive memory from channel."); @@ -356,9 +373,8 @@ _recv_mem(dragonChannelRecvh_t* recvh, dragonMemoryDescr_t* mem, uint64_t* arg, no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_recv_bytes_into(dragonChannelRecvh_t* recvh, uint8_t** data, size_t* num_bytes, uint64_t* arg, timespec_t* deadline) -{ +static dragonError_t _recv_bytes_into(dragonChannelRecvh_t* recvh, uint8_t** data, size_t* num_bytes, + uint64_t* arg, timespec_t* deadline) { dragonError_t err; dragonMemoryDescr_t mem; void* mem_ptr; @@ -395,12 +411,15 @@ 
_recv_bytes_into(dragonChannelRecvh_t* recvh, uint8_t** data, size_t* num_bytes, } else *data = NULL; + err = dragon_memory_free(&mem); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free memory."); + no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_recv_bytes_buffered(dragonFLIRecvHandle_t* recvh, size_t requested_size, size_t* received_size, uint8_t** data, uint64_t* arg, timespec_t* deadline) -{ +static dragonError_t _recv_bytes_buffered(dragonFLIRecvHandle_t* recvh, size_t requested_size, + size_t* received_size, uint8_t** data, uint64_t* arg, timespec_t* deadline) { dragonError_t err = DRAGON_SUCCESS; void* src_ptr = NULL; void* dest_ptr = NULL; @@ -522,10 +541,8 @@ _recv_bytes_buffered(dragonFLIRecvHandle_t* recvh, size_t requested_size, size_t no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_recv_bytes_common(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, - size_t* received_size, uint8_t** bytes, uint64_t* arg, const timespec_t* timeout) -{ +static dragonError_t _recv_bytes_common(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, + size_t* received_size, uint8_t** bytes, uint64_t* arg, const timespec_t* timeout) { dragonError_t err; dragonFLIRecvHandle_t* recvh_obj; timespec_t* deadline = NULL; @@ -559,7 +576,7 @@ _recv_bytes_common(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_siz recvh_obj->num_bytes_received += *received_size; - if (*received_size == 0 && *arg == FLI_EOT) { + if (*arg == FLI_EOT) { recvh_obj->stream_received = true; *arg = 0; /* FLI_EOT is internal only so don't expose it. */ no_err_return(DRAGON_EOT); @@ -572,9 +589,8 @@ _recv_bytes_common(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_siz no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_send_stream_channel(const dragonChannelDescr_t* strm_ch, const dragonChannelDescr_t* to_chan, dragonMemoryPoolDescr_t* pool, timespec_t* deadline) -{ +static dragonError_t _send_stream_channel(const dragonChannelDescr_t* strm_ch, const dragonChannelDescr_t* to_chan, + dragonMemoryPoolDescr_t* pool, timespec_t* deadline) { dragonError_t err; dragonChannelSerial_t ser; dragonChannelSendh_t sendh; @@ -615,9 +631,7 @@ _send_stream_channel(const dragonChannelDescr_t* strm_ch, const dragonChannelDes no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_recv_stream_channel(dragonChannelDescr_t* from_chan, dragonChannelDescr_t* strm_ch, timespec_t* deadline) -{ +static dragonError_t _recv_stream_channel(dragonChannelDescr_t* from_chan, dragonChannelDescr_t* strm_ch, timespec_t* deadline) { dragonError_t err; dragonChannelSerial_t ser; dragonChannelRecvh_t recvh; @@ -659,9 +673,7 @@ _recv_stream_channel(dragonChannelDescr_t* from_chan, dragonChannelDescr_t* strm no_err_return(DRAGON_SUCCESS); } -static dragonError_t -_empty_the_channel(dragonChannelDescr_t* channel) -{ +static dragonError_t _empty_the_channel(dragonChannelDescr_t* channel) { dragonError_t err; dragonChannelRecvh_t recvh; timespec_t deadline = {0,0}; @@ -698,9 +710,7 @@ _empty_the_channel(dragonChannelDescr_t* channel) } -static void* -_from_fd_to_fli (void* ptr) -{ +static void* _from_fd_to_fli (void* ptr) { dragonError_t err; uint8_t* buffer; size_t num_bytes = 0; @@ -713,6 +723,7 @@ _from_fd_to_fli (void* ptr) err = DRAGON_INTERNAL_MALLOC_FAIL; /* err might be logged eventually. 
*/ fprintf(stderr, "ERROR: The chunk size of %lu could not be allocated for sending (ERR=%s).", arg->chunk_size, dragon_get_rc_string(err)); + fflush(stderr); return NULL; } @@ -725,24 +736,26 @@ _from_fd_to_fli (void* ptr) /* err might be logged eventually. But no way to return the error to user. They will see a problem with the file descriptor. */ fprintf(stderr, "ERROR: There was an error sending bytes through the fli interface (ERR=%s).\n", dragon_get_rc_string(err)); + fflush(stderr); } close(fd); free(buffer); + buffer = NULL; if (arg->buffer) { err = dragon_fli_send_bytes(arg->sendh, 0, NULL, 0, false, NULL); - if (err != DRAGON_SUCCESS) + if (err != DRAGON_SUCCESS) { fprintf(stderr, "ERROR: Could not flush the buffered bytes from the file descriptor thread helper."); + fflush(stderr); + } } pthread_exit(NULL); } -static void* -_from_fli_to_fd (void* ptr) -{ +static void* _from_fli_to_fd (void* ptr) { dragonError_t err; uint8_t* buffer; uint64_t recv_arg; @@ -757,11 +770,13 @@ _from_fli_to_fd (void* ptr) written_bytes += write(arg->fd, &buffer[written_bytes], num_bytes - written_bytes); free(buffer); + buffer = NULL; } if (err != DRAGON_EOT) { /* err might be logged eventually. */ fprintf(stderr, "ERROR: There was an error receiving data from the fli interface (ERR=%s).\n", dragon_get_rc_string(err)); + fflush(stderr); } close(fd); @@ -772,21 +787,17 @@ _from_fli_to_fd (void* ptr) /* Beginning of user API */ /****************************************************************************************/ -dragonError_t -dragon_fli_attr_init(dragonFLIAttr_t* attr) -{ +dragonError_t dragon_fli_attr_init(dragonFLIAttr_t* attr) { attr->_placeholder = 0; return DRAGON_SUCCESS; } -dragonError_t -dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, +dragonError_t dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, dragonChannelDescr_t* mgr_ch, dragonMemoryPoolDescr_t* pool, const dragonULInt num_strm_chs, dragonChannelDescr_t** strm_channels, - const bool use_buffered_protocol, dragonFLIAttr_t* attrs) -{ + const bool use_buffered_protocol, dragonFLIAttr_t* attrs) { dragonError_t err; dragonFLIAttr_t def_attr; uint64_t msg_count; @@ -829,6 +840,7 @@ dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Cannot allocate new file-like interface adapter."); obj->attrs = *attrs; + obj->was_attached = false; /* created, not attached */ if (pool == NULL) { /* We will attach to the default pool in this case. */ @@ -849,12 +861,15 @@ dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, if (err != DRAGON_SUCCESS) append_err_return(err, "Cannot clone main channel descriptor."); - err = dragon_channel_message_count(&obj->main_ch, &msg_count); - if (err != DRAGON_SUCCESS) - append_err_return(err, "Could not get the main channel message count during creation."); + if (!use_buffered_protocol) { + /* If we are using buffered protocol, then it does not need to be empty during creation. 
*/ + err = dragon_channel_message_count(&obj->main_ch, &msg_count); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get the main channel message count during creation."); - if (msg_count > 0) - err_return(DRAGON_INVALID_ARGUMENT, "The main channel has items in it during adapter creation."); + if (msg_count > 0) + err_return(DRAGON_INVALID_ARGUMENT, "The main channel has items in it during adapter creation."); + } obj->has_main_ch = true; } else @@ -892,9 +907,7 @@ dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, } -dragonError_t -dragon_fli_destroy(dragonFLIDescr_t* adapter) -{ +dragonError_t dragon_fli_destroy(dragonFLIDescr_t* adapter) { dragonError_t err; dragonFLI_t* obj; @@ -917,20 +930,33 @@ dragon_fli_destroy(dragonFLIDescr_t* adapter) append_err_return(err, "Could not empty the main channel."); } + if (obj->was_attached) { + if (obj->has_main_ch) { + err = dragon_channel_detach(&obj->main_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot detach from main channel of adapter."); + } + + if (obj->has_mgr_ch) { + err = dragon_channel_detach(&obj->mgr_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot detach from manager channel of adapter."); + } + } + err = dragon_umap_delitem(dg_fli_adapters, adapter->_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to delete adapter from from adapters umap"); free(obj); + obj = NULL; adapter->_idx = 0; no_err_return(DRAGON_SUCCESS); } -dragonError_t -dragon_fli_serialize(const dragonFLIDescr_t* adapter, dragonFLISerial_t* serial) -{ +dragonError_t dragon_fli_serialize(const dragonFLIDescr_t* adapter, dragonFLISerial_t* serial) { dragonError_t err; dragonFLI_t* obj; uint8_t adapter_type = 0; @@ -1007,26 +1033,24 @@ dragon_fli_serialize(const dragonFLIDescr_t* adapter, dragonFLISerial_t* serial) } -dragonError_t -dragon_fli_serial_free(dragonFLISerial_t* serial) -{ +dragonError_t dragon_fli_serial_free(dragonFLISerial_t* serial) { if (serial == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Invalid serialized fli adapter."); if (serial->data == NULL) no_err_return(DRAGON_SUCCESS); - free(serial->data); + if (serial->data != NULL) + free(serial->data); + serial->data = NULL; no_err_return(DRAGON_SUCCESS); } -dragonError_t -dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t* pool, - dragonFLIDescr_t* adapter) -{ +dragonError_t dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t* pool, + dragonFLIDescr_t* adapter) { dragonError_t err; dragonFLI_t* obj; uint8_t adapter_type = 0; @@ -1064,6 +1088,7 @@ dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t obj->attrs = *attrs; obj->num_strm_chs = 0; /* We don't keep track of it the channels in attached objects */ + obj->was_attached = true; /* was attached, not created */ if (pool == NULL) { /* We will attach to the default pool in this case. 
*/ @@ -1113,9 +1138,7 @@ dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t } -dragonError_t -dragon_fli_detach(dragonFLIDescr_t* adapter) -{ +dragonError_t dragon_fli_detach(dragonFLIDescr_t* adapter) { dragonError_t err; dragonFLI_t* obj; @@ -1126,21 +1149,79 @@ dragon_fli_detach(dragonFLIDescr_t* adapter) if (err != DRAGON_SUCCESS) append_err_return(err, "Could not resolve adapter to internal fli object"); + if (obj->was_attached) { + if (obj->has_main_ch) { + err = dragon_channel_detach(&obj->main_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot detach from main channel of adapter."); + } + + if (obj->has_mgr_ch) { + err = dragon_channel_detach(&obj->mgr_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot detach from manager channel of adapter."); + } + } + err = dragon_umap_delitem(dg_fli_adapters, adapter->_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to delete adapter from from adapters umap"); free(obj); + obj = NULL; adapter->_idx = 0; no_err_return(DRAGON_SUCCESS); } - dragonError_t -dragon_fli_open_send_handle(const dragonFLIDescr_t* adapter, dragonFLISendHandleDescr_t* send_handle, - dragonChannelDescr_t* strm_ch, const timespec_t* timeout) -{ +dragon_fli_get_available_streams(dragonFLIDescr_t* adapter, uint64_t* num_streams, const timespec_t* timeout) { + dragonError_t err; + dragonFLI_t* obj; + + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); + + err = _fli_from_descr(adapter, &obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve adapter to internal fli object"); + + if (obj->has_mgr_ch) { + /* This works with both an on-node and off-node manager channel */ + + err = dragon_channel_poll(&obj->mgr_ch, DRAGON_IDLE_WAIT, DRAGON_CHANNEL_POLLSIZE, timeout, num_streams); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not empty the manager channel."); + + no_err_return(DRAGON_SUCCESS); + } + + err_return(DRAGON_INVALID_ARGUMENT, "The fli adapter does not have a manager channel and therefore calling dragon_fli_get_available_streams is invalid."); +} + + +dragonError_t dragon_fli_is_buffered(const dragonFLIDescr_t* adapter, bool* is_buffered) { + dragonError_t err; + dragonFLI_t* obj; + + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); + + if (is_buffered == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The is_buffered variable cannot be NULL."); + + err = _fli_from_descr(adapter, &obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve adapter to internal fli object"); + + *is_buffered = obj->use_buffered_protocol; + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t dragon_fli_open_send_handle(const dragonFLIDescr_t* adapter, dragonFLISendHandleDescr_t* send_handle, + dragonChannelDescr_t* strm_ch, const timespec_t* timeout) { dragonError_t err; dragonFLI_t* obj; dragonFLISendHandle_t* sendh_obj; @@ -1254,13 +1335,12 @@ dragon_fli_open_send_handle(const dragonFLIDescr_t* adapter, dragonFLISendHandle } -dragonError_t -dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, const timespec_t* timeout) -{ +dragonError_t dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, const timespec_t* timeout) { dragonError_t err; dragonFLISendHandle_t* sendh_obj; timespec_t* deadline = NULL; timespec_t end_time; + uint8_t dummy = 0; if (send_handle == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); 
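Aside (illustrative, not part of the patch): the two query functions added above, dragon_fli_get_available_streams() and dragon_fli_is_buffered(), are read-only probes on an adapter. A minimal usage sketch against the signatures introduced in this patch; adapter stands in for an already created or attached dragonFLIDescr_t and the timeout value is arbitrary:

    bool is_buffered = false;
    uint64_t num_streams = 0;
    timespec_t timeout = {5, 0};   /* placeholder: wait up to 5 seconds */

    err = dragon_fli_is_buffered(&adapter, &is_buffered);
    if (err != DRAGON_SUCCESS)
        append_err_return(err, "Could not query the adapter's buffered mode.");

    /* only valid for adapters with a manager channel; otherwise this returns
       DRAGON_INVALID_ARGUMENT as coded above */
    err = dragon_fli_get_available_streams(&adapter, &num_streams, &timeout);
    if (err != DRAGON_SUCCESS)
        append_err_return(err, "Could not query the number of available stream channels.");
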
@@ -1288,7 +1368,7 @@ dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, const time if (!sendh_obj->adapter->use_buffered_protocol) { /* sending the EOT indicator for the stream. */ - err = _send_bytes(&sendh_obj->chan_sendh, &sendh_obj->adapter->pool, NULL, 0, FLI_EOT, deadline); + err = _send_bytes(&sendh_obj->chan_sendh, &sendh_obj->adapter->pool, &dummy, 1, FLI_EOT, deadline); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not send the end of stream indicator down the stream channel."); } @@ -1305,15 +1385,14 @@ dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, const time send_handle->_idx = 0; free(sendh_obj); + sendh_obj = NULL; no_err_return(DRAGON_SUCCESS); } -dragonError_t -dragon_fli_open_recv_handle(const dragonFLIDescr_t* adapter, dragonFLIRecvHandleDescr_t* recv_handle, - dragonChannelDescr_t* strm_ch, const timespec_t* timeout) -{ +dragonError_t dragon_fli_open_recv_handle(const dragonFLIDescr_t* adapter, dragonFLIRecvHandleDescr_t* recv_handle, + dragonChannelDescr_t* strm_ch, const timespec_t* timeout) { dragonError_t err; dragonFLI_t* obj; dragonFLIRecvHandle_t* recvh_obj; @@ -1431,9 +1510,7 @@ dragon_fli_open_recv_handle(const dragonFLIDescr_t* adapter, dragonFLIRecvHandle } -dragonError_t -dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const timespec_t* timeout) -{ +dragonError_t dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const timespec_t* timeout) { dragonError_t err; dragonFLIRecvHandle_t* recvh_obj; timespec_t* deadline = NULL; @@ -1482,15 +1559,37 @@ dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const time recv_handle->_idx = 0; free(recvh_obj); + recvh_obj = NULL; no_err_return(DRAGON_SUCCESS); } + dragonError_t -dragon_fli_create_writable_fd(dragonFLISendHandleDescr_t* send_handle, int* fd_ptr, +dragon_fli_stream_received(dragonFLIRecvHandleDescr_t* recv_handle, bool* stream_received) { + + dragonError_t err; + dragonFLIRecvHandle_t* recvh_obj; + + if (recv_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli receive handle descriptor"); + + if (stream_received == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The stream_received variable cannot be NULL."); + + err = _fli_recvh_from_descr(recv_handle, &recvh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve receive handle to internal fli receive handle object"); + + *stream_received = recvh_obj->stream_received; + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t dragon_fli_create_writable_fd(dragonFLISendHandleDescr_t* send_handle, int* fd_ptr, const bool buffer, size_t chunk_size, - const uint64_t user_arg, const timespec_t* timeout) -{ + const uint64_t user_arg, const timespec_t* timeout) { dragonError_t err; dragonFLISendHandle_t* sendh_obj; @@ -1537,9 +1636,7 @@ dragon_fli_create_writable_fd(dragonFLISendHandleDescr_t* send_handle, int* fd_p no_err_return(DRAGON_SUCCESS); } -dragonError_t -dragon_fli_finalize_writable_fd(dragonFLISendHandleDescr_t* send_handle) -{ +dragonError_t dragon_fli_finalize_writable_fd(dragonFLISendHandleDescr_t* send_handle) { dragonError_t err; dragonFLISendHandle_t* sendh_obj; @@ -1569,9 +1666,7 @@ dragon_fli_finalize_writable_fd(dragonFLISendHandleDescr_t* send_handle) no_err_return(DRAGON_SUCCESS); } -dragonError_t -dragon_fli_create_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle, int* fd_ptr, const timespec_t* timeout) -{ +dragonError_t dragon_fli_create_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle, 
int* fd_ptr, const timespec_t* timeout) { dragonError_t err; dragonFLIRecvHandle_t* recvh_obj; @@ -1613,9 +1708,7 @@ dragon_fli_create_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle, int* fd_p no_err_return(DRAGON_SUCCESS); } -dragonError_t -dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle) -{ +dragonError_t dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle) { dragonError_t err; dragonFLIRecvHandle_t* recvh_obj; @@ -1645,20 +1738,21 @@ dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle) no_err_return(DRAGON_SUCCESS); } -dragonError_t -dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, - uint8_t* bytes, uint64_t arg, const bool buffer, const timespec_t* timeout) -{ +dragonError_t dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, + uint8_t* bytes, uint64_t arg, const bool buffer, const timespec_t* timeout) { dragonError_t err; dragonFLISendHandle_t* sendh_obj; timespec_t* deadline = NULL; timespec_t end_time; + if (arg == FLI_EOT) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot specify an argument value of 0xFFFFFFFFFFFFFFFF. This value is reserved for internal usage."); + if (send_handle == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); if (bytes == NULL && num_bytes > 0) - err_return(DRAGON_INVALID_ARGUMENT, "Cannot send non-zer number of bytes with NULL pointer."); + err_return(DRAGON_INVALID_ARGUMENT, "Cannot send non-zero number of bytes with NULL pointer."); if (timeout != NULL) { deadline = &end_time; @@ -1688,15 +1782,15 @@ dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, } -dragonError_t -dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t* mem, - uint64_t arg, const timespec_t* timeout) -{ +dragonError_t dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t* mem, + uint64_t arg, bool transfer_ownership, const timespec_t* timeout) { dragonError_t err; dragonFLISendHandle_t* sendh_obj; timespec_t* deadline = NULL; timespec_t end_time; + if (arg == FLI_EOT) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot specify an argument value of 0xFFFFFFFFFFFFFFFF. This value is reserved for internal usage."); if (send_handle == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); @@ -1719,7 +1813,7 @@ dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t err_return(DRAGON_INVALID_ARGUMENT, "You cannot use dragon_fli_send_mem on a buffered fli adapter. 
Use dragon_fli_send_bytes instead."); /* sending mem on stream channel */ - err = _send_mem(&sendh_obj->chan_sendh, mem, arg, deadline); + err = _send_mem(&sendh_obj->chan_sendh, mem, arg, transfer_ownership, deadline); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not send the managed memory down the stream channel."); @@ -1727,11 +1821,9 @@ dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t } -dragonError_t -dragon_fli_recv_bytes_into(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, +dragonError_t dragon_fli_recv_bytes_into(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, size_t* received_size, uint8_t* bytes, uint64_t* arg, - const timespec_t* timeout) -{ + const timespec_t* timeout) { dragonError_t err; uint8_t* buffer_ptr = bytes; @@ -1746,11 +1838,9 @@ dragon_fli_recv_bytes_into(dragonFLIRecvHandleDescr_t* recv_handle, size_t reque } -dragonError_t -dragon_fli_recv_bytes(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, +dragonError_t dragon_fli_recv_bytes(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, size_t* received_size, uint8_t** bytes, uint64_t* arg, - const timespec_t* timeout) -{ + const timespec_t* timeout) { dragonError_t err; if (bytes == NULL) @@ -1762,17 +1852,14 @@ dragon_fli_recv_bytes(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_ err = _recv_bytes_common(recv_handle, requested_size, received_size, bytes, arg, timeout); if (err != DRAGON_SUCCESS && err != DRAGON_EOT) - append_err_return(err, "Could not receive bytes into."); + append_err_return(err, "Could not receive bytes."); no_err_return(err); } -dragonError_t -dragon_fli_recv_mem(dragonFLIRecvHandleDescr_t* recv_handle, dragonMemoryDescr_t* mem, - uint64_t* arg, const timespec_t* timeout) - -{ +dragonError_t dragon_fli_recv_mem(dragonFLIRecvHandleDescr_t* recv_handle, dragonMemoryDescr_t* mem, + uint64_t* arg, const timespec_t* timeout) { dragonError_t err; dragonFLIRecvHandle_t* recvh_obj; timespec_t* deadline = NULL; @@ -1817,7 +1904,7 @@ dragon_fli_recv_mem(dragonFLIRecvHandleDescr_t* recv_handle, dragonMemoryDescr_t recvh_obj->num_bytes_received += received_size; - if (received_size == 0 && *arg == FLI_EOT) { + if (*arg == FLI_EOT) { recvh_obj->stream_received = true; *arg = 0; /* FLI_EOT is internal only so don't expose it. 
*/ append_err_return(DRAGON_EOT, "Reached the end of stream"); diff --git a/src/lib/gpu/cuda.cpp b/src/lib/gpu/cuda.cpp new file mode 100644 index 0000000..f481c39 --- /dev/null +++ b/src/lib/gpu/cuda.cpp @@ -0,0 +1,233 @@ +#ifdef HAVE_CUDA_INCLUDE + +#include "cuda.hpp" + +// init cuda backend + +cudaError_t (*fn_cudaMalloc)(void **addr, size_t size); +cudaError_t (*fn_cudaFree)(void *addr); +cudaError_t (*fn_cudaMemcpy)(void* dst_addr, const void* src_addr, size_t size, cudaMemcpyKind kind); +cudaError_t (*fn_cudaMemset)(void *addr, int val, size_t num_bytes); +cudaError_t (*fn_cudaSetDeviceFlags)(unsigned int flags); +cudaError_t (*fn_cudaDeviceSynchronize)(void); +cudaError_t (*fn_cudaIpcGetMemHandle)(cudaIpcMemHandle_t *ipc_handle, void *addr); +cudaError_t (*fn_cudaIpcOpenMemHandle)(void **addr, cudaIpcMemHandle_t ipc_handle, unsigned int flags); +cudaError_t (*fn_cudaIpcCloseMemHandle)(void *addr); +const char* (*fn_cudaGetErrorString)(cudaError_t cuda_rc); + +void * +dragon_gpu_open_cuda_lib() +{ + return dlopen(dragonGPU_cuda::libname, RTLD_LAZY | RTLD_GLOBAL); +} + +dragonError_t +dragon_gpu_resolve_cuda_symbols(void *libhandle) +{ + fn_cudaMalloc = (cudaError_t (*)(void **, size_t)) dlsym(libhandle, "cudaMalloc"); + assert(fn_cudaMalloc != nullptr); + + fn_cudaFree = (cudaError_t (*)(void *)) dlsym(libhandle, "cudaFree"); + assert(fn_cudaFree != nullptr); + + fn_cudaMemcpy = (cudaError_t (*)(void *, const void *, size_t, cudaMemcpyKind)) dlsym(libhandle, "cudaMemcpy"); + assert(fn_cudaMemcpy != nullptr); + + fn_cudaMemset = (cudaError_t (*)(void *, int, size_t)) dlsym(libhandle, "cudaMemset"); + assert(fn_cudaMemset != nullptr); + + fn_cudaSetDeviceFlags = (cudaError_t (*)(unsigned int)) dlsym(libhandle, "cudaSetDeviceFlags"); + assert(fn_cudaSetDeviceFlags != nullptr); + + fn_cudaDeviceSynchronize = (cudaError_t (*)(void)) dlsym(libhandle, "cudaDeviceSynchronize"); + assert(fn_cudaDeviceSynchronize != nullptr); + + fn_cudaIpcGetMemHandle = (cudaError_t (*)(cudaIpcMemHandle_t *, void *)) dlsym(libhandle, "cudaIpcGetMemHandle"); + assert(fn_cudaIpcGetMemHandle != nullptr); + + fn_cudaIpcOpenMemHandle = (cudaError_t (*)(void **, cudaIpcMemHandle_t, unsigned int)) dlsym(libhandle, "cudaIpcOpenMemHandle"); + assert(fn_cudaIpcOpenMemHandle != nullptr); + + fn_cudaIpcCloseMemHandle = (cudaError_t (*)(void *)) dlsym(libhandle, "cudaIpcCloseMemHandle"); + assert(fn_cudaIpcCloseMemHandle != nullptr); + + fn_cudaGetErrorString = (const char* (*)(cudaError_t)) dlsym(libhandle, "cudaGetErrorString"); + assert(fn_cudaGetErrorString != nullptr); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_gpu_setup_cuda(void *libhandle, dragonGPUHandle_t *gpuh) +{ + dragon_gpu_resolve_cuda_symbols(libhandle); + + try { + gpuh->dgpu = std::make_shared<dragonGPU_cuda>(); + } catch (std::exception& e) { + append_err_return(DRAGON_FAILURE, e.what()); + } + + no_err_return(DRAGON_SUCCESS); +} + +// member function definitions + +dragonGPU_cuda::dragonGPU_cuda() +{ + this->backend_type = DRAGON_GPU_BACKEND_CUDA; + + auto flags = cudaDeviceScheduleBlockingSync; + + auto cuda_rc = fn_cudaSetDeviceFlags(flags); + if (cuda_rc != cudaSuccess) { + auto errstr = this->get_errstr("failed to set device flags", cuda_rc); + throw std::runtime_error(errstr.c_str()); + } +} + +dragonError_t +dragonGPU_cuda::mem_alloc(void **addr, size_t size) +{ + auto cuda_rc = fn_cudaMalloc(addr, size); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to allocate device memory",
cuda_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_cuda::mem_free(void *addr) +{ + auto cuda_rc = fn_cudaFree(addr); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to free device memory", cuda_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_cuda::get_ipc_handle(void *addr, std::vector& ipc_handle_out) +{ + cudaIpcMemHandle_t ipc_handle; + + auto cuda_rc = fn_cudaIpcGetMemHandle(&ipc_handle, addr); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to get IPC handle", cuda_rc).c_str()); + } + + ipc_handle_out.resize(sizeof(cudaIpcMemHandle_t)); + memcpy(&ipc_handle_out[0], &ipc_handle, ipc_handle_out.size()); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_cuda::free_ipc_handle(std::vector& ipc_handle) +{ + // this is a no-op for cuda + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_cuda::attach(std::vector& ipc_handle_in, void **addr) +{ + cudaIpcMemHandle_t ipc_handle; + memcpy(&ipc_handle, &ipc_handle_in[0], sizeof(cudaIpcMemHandle_t)); + + auto flags = cudaIpcMemLazyEnablePeerAccess; + + auto cuda_rc = fn_cudaIpcOpenMemHandle(addr, ipc_handle, flags); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to open IPC handle", cuda_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_cuda::detach(void *addr) +{ + auto cuda_rc = fn_cudaIpcCloseMemHandle(addr); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to close IPC handle", cuda_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +static cudaMemcpyKind +dragon_to_cuda_memcpy_kind(dragonGPUMemcpyType_t memcpy_type) +{ + switch (memcpy_type) { + case DRAGON_GPU_D2D: { + return cudaMemcpyDeviceToDevice; + } + case DRAGON_GPU_D2H: { + return cudaMemcpyDeviceToHost; + } + case DRAGON_GPU_H2D: { + return cudaMemcpyHostToDevice; + } + default: { + assert("invalid memcpy type"); + } + } + + return cudaMemcpyDefault; +} + +dragonError_t +dragonGPU_cuda::copy(void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type) +{ + auto cuda_rc = fn_cudaMemcpy(dst_addr, src_addr, size, dragon_to_cuda_memcpy_kind(memcpy_type)); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to copy memory", cuda_rc).c_str()); + } + + cuda_rc = fn_cudaDeviceSynchronize(); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to synchronize device", cuda_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_cuda::memset(void *addr, int val, size_t num_bytes) +{ + auto cuda_rc = fn_cudaMemset(addr, val, num_bytes); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to set memory", cuda_rc).c_str()); + } + + cuda_rc = fn_cudaDeviceSynchronize(); + if (cuda_rc != cudaSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to synchronize device", cuda_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +std::string +dragonGPU_cuda::get_errstr(const char *event, int cuda_rc) +{ + auto errstr = fn_cudaGetErrorString((cudaError_t) cuda_rc); + + auto log_str = + std::string(event) + + std::string(": rc=") + std::to_string(cuda_rc) + + (errstr ? 
std::string(", ") + std::string(errstr) : std::string("")); + + if (dragon_gpu_debug) { + fprintf(dragon_gpu_log, "%s", log_str.c_str()); + fflush(dragon_gpu_log); + } + + return log_str; +} + +#endif // HAVE_CUDA_INCLUDE + diff --git a/src/lib/gpu/cuda.hpp b/src/lib/gpu/cuda.hpp new file mode 100644 index 0000000..cee47ea --- /dev/null +++ b/src/lib/gpu/cuda.hpp @@ -0,0 +1,43 @@ +#ifndef HAVE_DRAGON_GPU_CUDA_HPP +#define HAVE_DRAGON_GPU_CUDA_HPP + +#include "gpu.hpp" +#include "cuda_runtime.h" + +class dragonGPU_cuda final : public dragonGPU { +public: + + static constexpr const char *libname{"libcudart.so"}; + + dragonGPU_cuda(); + + dragonError_t + mem_alloc(void **addr, size_t size) override; + + dragonError_t + mem_free(void *addr) override; + + dragonError_t + get_ipc_handle(void *addr, std::vector& ipc_handle) override; + + dragonError_t + free_ipc_handle(std::vector& ipc_handle) override; + + dragonError_t + attach(std::vector& ipc_handle, void **addr) override; + + dragonError_t + detach(void *addr) override; + + dragonError_t + copy(void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type) override; + + dragonError_t + memset(void *addr, int val, size_t num_bytes) override; + + std::string + get_errstr(const char *event, int cuda_rc) override; +}; + +#endif // HAVE_DRAGON_GPU_CUDA_HPP + diff --git a/src/lib/gpu/gpu.cpp b/src/lib/gpu/gpu.cpp new file mode 100644 index 0000000..0c05692 --- /dev/null +++ b/src/lib/gpu/gpu.cpp @@ -0,0 +1,291 @@ +#include "gpu.hpp" + +bool dragon_gpu_debug = false; +FILE *dragon_gpu_log = nullptr; + +/** + * @brief Set up a GPU backend and obtain a handle to it. + * + * @param backend_type IN constant indicating which vendor backend to use + * @param gpuh OUT handle to the GPU + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. 
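+ *
+ * A minimal usage sketch (illustrative only, not part of this patch; it assumes
+ * the CUDA runtime library can be dlopen'd on the node and skips most error handling):
+ *
+ *     dragonGPUHandle_t gpuh;
+ *     if (dragon_gpu_setup(DRAGON_GPU_BACKEND_CUDA, &gpuh) != DRAGON_SUCCESS)
+ *         return DRAGON_FAILURE;
+ *     // ... dragon_gpu_mem_alloc / dragon_gpu_copy / dragon_gpu_mem_free calls ...
+ *     dragon_gpu_cleanup(&gpuh);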
+ */ + +dragonError_t +dragon_gpu_setup(dragonGPUBackend_t backend_type, dragonGPUHandle_t *gpuh) +{ + gpuh->lock.acquire(); + + // set up debugging log file + + auto tmp_envstr = getenv("_DRAGON_GPU_DEBUG"); + if (tmp_envstr != nullptr) { + dragon_gpu_debug = bool(atoi(tmp_envstr)); + } + + if (dragon_gpu_debug) { + char dbg_filename[256]; + char my_hostname[128]; + gethostname(my_hostname, 128); + sprintf(dbg_filename, "dragon_gpu.%s.%d.log", my_hostname, getpid()); + dragon_gpu_log = fopen(dbg_filename, "w"); + if (dragon_gpu_log == nullptr) { + append_err_return(DRAGON_FAILURE, "failed to open Dragon GPU debug log"); + } + } + + // set up gpu library + + switch (backend_type) { +#ifdef HAVE_CUDA_INCLUDE + case DRAGON_GPU_BACKEND_CUDA: { + auto libhandle = dragon_gpu_open_cuda_lib(); + if (libhandle) { + dragon_gpu_setup_cuda(libhandle, gpuh); + } else { + append_err_return(DRAGON_FAILURE, "failed to dlopen CUDA backend library"); + } + break; + } +#endif // HAVE_CUDA_INCLUDE +#ifdef HAVE_HIP_INCLUDE + case DRAGON_GPU_BACKEND_HIP: { + auto libhandle = dragon_gpu_open_hip_lib(); + if (libhandle) { + dragon_gpu_setup_hip(libhandle, gpuh); + } else { + append_err_return(DRAGON_FAILURE, "failed to dlopen HIP backend library"); + } + break; + } +#endif // HAVE_HIP_INCLUDE +#ifdef HAVE_ZE_INCLUDE + case DRAGON_GPU_BACKEND_ZE: { + auto libhandle = dragon_gpu_open_ze_lib(); + if (libhandle) { + dragon_gpu_setup_ze(libhandle, gpuh); + } else { + append_err_return(DRAGON_FAILURE, "failed to dlopen ZE backend library"); + } + break; + } +#endif // HAVE_ZE_INCLUDE + default: { + append_err_return(DRAGON_FAILURE, "invalid GPU backend type"); + } + } + + if (gpuh->dgpu != nullptr) { + gpuh->lock.release(); + no_err_return(DRAGON_SUCCESS); + } else { + gpuh->lock.release(); + err_return(DRAGON_FAILURE, "no GPU detected"); + } +} + +/** + * @brief Clean up resources for a GPU backend. + * + * @param gpuh IN handle to the GPU + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. + */ + +dragonError_t +dragon_gpu_cleanup(dragonGPUHandle_t *gpuh) +{ + gpuh->lock.acquire(); + gpuh->dgpu.reset(); + gpuh->lock.release(); + no_err_return(DRAGON_SUCCESS); +} + +/** + * @brief Allocate memory on the device specified by @ref gpuh. + * + * @param gpuh IN handle to the GPU + * @param addr OUT pointer to the base of the memory allocation + * @param size IN size of the memory allocation + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. + */ + +dragonError_t +dragon_gpu_mem_alloc(dragonGPUHandle_t *gpuh, void **addr, size_t size) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->mem_alloc(addr, size); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Free memory on the device specified by @ref gpuh. + * + * @param gpuh IN handle to the GPU + * @param addr IN pointer to the base of the memory to be freed + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. + */ + +dragonError_t +dragon_gpu_mem_free(dragonGPUHandle_t *gpuh, void *addr) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->mem_free(addr); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Get an IPC handle for a memory allocation that can be shared with other processes. + * + * @param gpuh IN handle to the GPU + * @param addr IN pointer to the base of the allocation for the IPC handle + * @param ipc_handle OUT the IPC handle + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. 
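+ *
+ * Illustrative flow (a sketch, not part of this patch): the allocating process obtains
+ * the handle and ships its bytes to a peer using whatever transport the caller already
+ * has; the peer then maps the allocation with @ref dragon_gpu_attach. It assumes gpuh
+ * was initialized with @ref dragon_gpu_setup in each process, nbytes is the allocation
+ * size, and error handling is omitted.
+ *
+ *     // producer
+ *     void *dev_ptr;
+ *     dragonIPCHandle_t ipch;
+ *     dragon_gpu_mem_alloc(&gpuh, &dev_ptr, nbytes);
+ *     dragon_gpu_get_ipc_handle(&gpuh, dev_ptr, &ipch);
+ *     // ... send ipch.data to the consumer; later call dragon_gpu_free_ipc_handle(&gpuh, &ipch) ...
+ *
+ *     // consumer (after receiving the handle bytes into its own dragonIPCHandle_t ipch)
+ *     void *peer_ptr;
+ *     dragon_gpu_attach(&gpuh, &ipch, &peer_ptr);
+ *     // ... use peer_ptr ...
+ *     dragon_gpu_detach(&gpuh, peer_ptr);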
+ */ + +dragonError_t +dragon_gpu_get_ipc_handle(dragonGPUHandle_t *gpuh, void *addr, dragonIPCHandle_t *ipc_handle) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->get_ipc_handle(addr, ipc_handle->data); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Free an IPC handle. + * + * The process that calls @ref dragon_gpu_get_ipc_handle must call this function + * to clean up the IPC handle. + * + * @param gpuh IN handle to the GPU + * @param ipc_handle IN the IPC handle + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. + */ + +dragonError_t +dragon_gpu_free_ipc_handle(dragonGPUHandle_t *gpuh, dragonIPCHandle_t *ipc_handle) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->free_ipc_handle(ipc_handle->data); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Use an IPC handle to attach to an inter-process memory allocation. + * + * A process that receives an IPC handle can use this function to attach to an + * inter-process memory allocation. Once attached, this process can access the + * allocation using a local virtual address segment. + * + * @param gpuh IN handle to the GPU + * @param ipc_handle IN the IPC handle + * @param addr OUT pointer to the base of the allocation for the IPC handle + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. + */ + +dragonError_t +dragon_gpu_attach(dragonGPUHandle_t *gpuh, dragonIPCHandle_t *ipc_handle, void **addr) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->attach(ipc_handle->data, addr); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Detach from an inter-process memory allocation. + * + * Once a process detaches from an inter-process memory allocation, it will + * lose access to the allocation, but the allocation will not be freed. The + * process that originally called @ref dragon_gpu_mem_alloc must free the + * memory. + * + * @param gpuh IN handle to the GPU + * @param addr IN pointer to the base of the allocation to detach from + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. + */ + +dragonError_t +dragon_gpu_detach(dragonGPUHandle_t *gpuh, void *addr) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->detach(addr); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Copy data between device buffers, or between device and host buffers. + * + * The direction of the memory copy is determined by the @ref memcpy_type parameter. + * The direction can be either device-to-device (DRAGON_GPU_D2D), device-to-host (DRAGON_GPU_D2H) + * or host-to-device (DRAGON_GPU_H2D). + * + * @param gpuh IN handle to the GPU + * @param dst_addr INOUT the destination buffer + * @param src_addr IN the source buffer + * @param size IN size in bytes of the data to be copied + * @param memcpy_type IN direction of the memory copy + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. + */ + +dragonError_t +dragon_gpu_copy(dragonGPUHandle_t *gpuh, void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->copy(dst_addr, src_addr, size, memcpy_type); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Update the values stored in a buffer. + * + * @param gpuh IN handle to the GPU + * @param addr INOUT the buffer to be updated + * @param val IN value between 0 and 255 to use for each byte in the buffer + * @param size IN size in bytes of the data to updated + * + * @return An error code for the operation. DRAGON_SUCCESS upon success. 
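+ *
+ * Sketch (illustrative, not part of this patch; assumes gpuh was set up with
+ * @ref dragon_gpu_setup and host_buf points to nbytes of host memory):
+ *
+ *     void *dev_ptr;
+ *     dragon_gpu_mem_alloc(&gpuh, &dev_ptr, nbytes);
+ *     dragon_gpu_memset(&gpuh, dev_ptr, 0xAB, nbytes);
+ *     dragon_gpu_copy(&gpuh, host_buf, dev_ptr, nbytes, DRAGON_GPU_D2H);  // every byte of host_buf is now 0xAB
+ *     dragon_gpu_mem_free(&gpuh, dev_ptr);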
+ */ + +dragonError_t +dragon_gpu_memset(dragonGPUHandle_t *gpuh, void *addr, int val, size_t size) +{ + gpuh->lock.acquire(); + auto derr = gpuh->dgpu->memset(addr, val, size); + gpuh->lock.release(); + return derr; +} + +/** + * @brief Get an error string corresponding to a return code. + * + * @param gpuh IN handle to the GPU + * @param event IN C string describing the operation + * @param rc IN the return code + * @param errstr INOUT C string to store the description of the return code + * @param strlen IN maximum number of bytes that can be stored in @ref errstr + */ + +// TODO: this function needs work (including the backend implementations for it) +void +dragon_gpu_get_errstr(dragonGPUHandle_t *gpuh, const char *event, int rc, char *errstr, int strlen) +{ + gpuh->lock.acquire(); + strncpy(errstr, gpuh->dgpu->get_errstr(event, rc).c_str(), strlen); + gpuh->lock.release(); +} + diff --git a/src/lib/gpu/gpu.hpp b/src/lib/gpu/gpu.hpp new file mode 100644 index 0000000..8c681f4 --- /dev/null +++ b/src/lib/gpu/gpu.hpp @@ -0,0 +1,139 @@ +#ifndef HAVE_DRAGON_GPU_HPP +#define HAVE_DRAGON_GPU_HPP + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif // _GNU_SOURCE + +#include "../err.h" +#include "../shared_lock.hpp" +#include "dragon/return_codes.h" +#include "dragon/shared_lock.h" +#include "dragon/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern bool dragon_gpu_debug; +extern FILE *dragon_gpu_log; + +enum dragonGPUBackend_t { + DRAGON_GPU_BACKEND_CUDA = 0, + DRAGON_GPU_BACKEND_HIP, + DRAGON_GPU_BACKEND_ZE +}; + +enum dragonGPUMemcpyType_t { + DRAGON_GPU_D2D = 0, + DRAGON_GPU_D2H, + DRAGON_GPU_H2D +}; + +class dragonGPU { +protected: + + dragonGPUBackend_t backend_type; + int device_idx; + +public: + + virtual dragonError_t mem_alloc(void **addr, size_t size) = 0; + virtual dragonError_t mem_free(void *addr) = 0; + virtual dragonError_t get_ipc_handle(void *addr, std::vector<uint8_t>& ipc_handle) = 0; + virtual dragonError_t free_ipc_handle(std::vector<uint8_t>& ipc_handle) = 0; + virtual dragonError_t attach(std::vector<uint8_t>& ipc_handle, void **addr) = 0; + virtual dragonError_t detach(void *addr) = 0; + virtual dragonError_t copy(void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type) = 0; + virtual dragonError_t memset(void *addr, int val, size_t num_bytes) = 0; + virtual std::string get_errstr(const char *event, int rc) = 0; +}; + +struct dragonGPUHandle_t { + std::shared_ptr<dragonGPU> dgpu; + dragonLock lock; +}; + +struct dragonIPCHandle_t { + std::vector<uint8_t> data; +}; + +// extern "C" decls for setup functions + +#ifdef __cplusplus +extern "C" { +#endif + +dragonError_t +dragon_gpu_setup(dragonGPUBackend_t gpu_backend_type, dragonGPUHandle_t *handle); + +dragonError_t +dragon_gpu_cleanup(dragonGPUHandle_t *handle); + +dragonError_t +dragon_gpu_setup_cuda(void *libhandle, dragonGPUHandle_t *handle); + +dragonError_t +dragon_gpu_setup_hip(void *libhandle, dragonGPUHandle_t *handle); + +dragonError_t +dragon_gpu_setup_ze(void *libhandle, dragonGPUHandle_t *handle); + +void * +dragon_gpu_open_cuda_lib(); + +dragonError_t +dragon_gpu_resolve_cuda_symbols(void *libhandle); + +void * +dragon_gpu_open_hip_lib(); + +dragonError_t +dragon_gpu_resolve_hip_symbols(void *libhandle); + +void * +dragon_gpu_open_ze_lib(); + +dragonError_t +dragon_gpu_resolve_ze_symbols(void *libhandle); + +dragonError_t +dragon_gpu_mem_alloc(dragonGPUHandle_t *handle, void **addr, size_t size); + +dragonError_t +dragon_gpu_mem_free(dragonGPUHandle_t
*handle, void *addr); + +dragonError_t +dragon_gpu_get_ipc_handle(dragonGPUHandle_t *handle, void *addr, dragonIPCHandle_t *ipc_handle); + +dragonError_t +dragon_gpu_free_ipc_handle(dragonGPUHandle_t *handle, dragonIPCHandle_t *ipc_handle); + +dragonError_t +dragon_gpu_attach(dragonGPUHandle_t *handle, dragonIPCHandle_t *ipc_handle, void **addr); + +dragonError_t +dragon_gpu_detach(dragonGPUHandle_t *handle, void *addr); + +dragonError_t +dragon_gpu_copy(dragonGPUHandle_t *handle, void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type); + +dragonError_t +dragon_gpu_memset(dragonGPUHandle_t *handle, void *addr, int val, size_t num_bytes); + +void +dragon_gpu_get_errstr(dragonGPUHandle_t *handle, const char *event, int rc, char *errstr, int strlen); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // HAVE_DRAGON_GPU_HPP + diff --git a/src/lib/gpu/hip.cpp b/src/lib/gpu/hip.cpp new file mode 100644 index 0000000..2769009 --- /dev/null +++ b/src/lib/gpu/hip.cpp @@ -0,0 +1,233 @@ +#ifdef HAVE_HIP_INCLUDE + +#include "hip.hpp" + +// init hip backend + +hipError_t (*fn_hipMalloc)(void **addr, size_t size); +hipError_t (*fn_hipFree)(void *addr); +hipError_t (*fn_hipMemcpy)(void* dst_addr, const void* src_addr, size_t size, hipMemcpyKind kind); +hipError_t (*fn_hipMemset)(void *addr, int val, size_t num_bytes); +hipError_t (*fn_hipSetDeviceFlags)(unsigned int flags); +hipError_t (*fn_hipDeviceSynchronize)(void); +hipError_t (*fn_hipIpcGetMemHandle)(hipIpcMemHandle_t *ipc_handle, void *addr); +hipError_t (*fn_hipIpcOpenMemHandle)(void **addr, hipIpcMemHandle_t ipc_handle, unsigned int flags); +hipError_t (*fn_hipIpcCloseMemHandle)(void *addr); +const char* (*fn_hipGetErrorString)(hipError_t hip_rc); + +void * +dragon_gpu_open_hip_lib() +{ + return dlopen(dragonGPU_hip::libname, RTLD_LAZY | RTLD_GLOBAL); +} + +dragonError_t +dragon_gpu_resolve_hip_symbols(void *libhandle) +{ + fn_hipMalloc = (hipError_t (*)(void **, size_t)) dlsym(libhandle, "hipMalloc"); + assert(fn_hipMalloc != nullptr); + + fn_hipFree = (hipError_t (*)(void *)) dlsym(libhandle, "hipFree"); + assert(fn_hipFree != nullptr); + + fn_hipMemcpy = (hipError_t (*)(void *, const void *, size_t, hipMemcpyKind)) dlsym(libhandle, "hipMemcpy"); + assert(fn_hipMemcpy != nullptr); + + fn_hipMemset = (hipError_t (*)(void *, int, size_t)) dlsym(libhandle, "hipMemset"); + assert(fn_hipMemset != nullptr); + + fn_hipSetDeviceFlags = (hipError_t (*)(unsigned int)) dlsym(libhandle, "hipSetDeviceFlags"); + assert(fn_hipSetDeviceFlags != nullptr); + + fn_hipDeviceSynchronize = (hipError_t (*)(void)) dlsym(libhandle, "hipDeviceSynchronize"); + assert(fn_hipDeviceSynchronize != nullptr); + + fn_hipIpcGetMemHandle = (hipError_t (*)(hipIpcMemHandle_t *, void *)) dlsym(libhandle, "hipIpcGetMemHandle"); + assert(fn_hipIpcGetMemHandle != nullptr); + + fn_hipIpcOpenMemHandle = (hipError_t (*)(void **, hipIpcMemHandle_t, unsigned int)) dlsym(libhandle, "hipIpcOpenMemHandle"); + assert(fn_hipIpcOpenMemHandle != nullptr); + + fn_hipIpcCloseMemHandle = (hipError_t (*)(void *)) dlsym(libhandle, "hipIpcCloseMemHandle"); + assert(fn_hipIpcCloseMemHandle != nullptr); + + fn_hipGetErrorString = (const char* (*)(hipError_t)) dlsym(libhandle, "hipGetErrorString"); + assert(fn_hipGetErrorString != nullptr); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_gpu_setup_hip(void *libhandle, dragonGPUHandle_t *gpuh) +{ + dragon_gpu_resolve_hip_symbols(libhandle); + + try { + gpuh->dgpu = std::make_shared<dragonGPU_hip>(); + } catch
(std::exception& e) { + append_err_return(DRAGON_FAILURE, e.what()); + } + + no_err_return(DRAGON_SUCCESS); +} + +// member function definitions + +dragonGPU_hip::dragonGPU_hip() +{ + this->backend_type = DRAGON_GPU_BACKEND_HIP; + + auto flags = hipDeviceScheduleBlockingSync; + + auto hip_rc = fn_hipSetDeviceFlags(flags); + if (hip_rc != hipSuccess) { + auto errstr = this->get_errstr("failed to set device flags", hip_rc); + throw std::runtime_error(errstr.c_str()); + } +} + +dragonError_t +dragonGPU_hip::mem_alloc(void **addr, size_t size) +{ + auto hip_rc = fn_hipMalloc(addr, size); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to allocate device memory", hip_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_hip::mem_free(void *addr) +{ + auto hip_rc = fn_hipFree(addr); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to free device memory", hip_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_hip::get_ipc_handle(void *addr, std::vector<uint8_t>& ipc_handle_out) +{ + hipIpcMemHandle_t ipc_handle; + + auto hip_rc = fn_hipIpcGetMemHandle(&ipc_handle, addr); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to get IPC handle", hip_rc).c_str()); + } + + ipc_handle_out.resize(sizeof(hipIpcMemHandle_t)); + memcpy(&ipc_handle_out[0], &ipc_handle, ipc_handle_out.size()); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_hip::free_ipc_handle(std::vector<uint8_t>& ipc_handle) +{ + // this is a no-op for hip + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_hip::attach(std::vector<uint8_t>& ipc_handle_in, void **addr) +{ + hipIpcMemHandle_t ipc_handle; + memcpy(&ipc_handle, &ipc_handle_in[0], sizeof(hipIpcMemHandle_t)); + + auto flags = hipIpcMemLazyEnablePeerAccess; + + auto hip_rc = fn_hipIpcOpenMemHandle(addr, ipc_handle, flags); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to open IPC handle", hip_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_hip::detach(void *addr) +{ + auto hip_rc = fn_hipIpcCloseMemHandle(addr); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to close IPC handle", hip_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +static hipMemcpyKind +dragon_to_hip_memcpy_kind(dragonGPUMemcpyType_t memcpy_type) +{ + switch (memcpy_type) { + case DRAGON_GPU_D2D: { + return hipMemcpyDeviceToDevice; + } + case DRAGON_GPU_D2H: { + return hipMemcpyDeviceToHost; + } + case DRAGON_GPU_H2D: { + return hipMemcpyHostToDevice; + } + default: { + assert("invalid memcpy type"); + } + } + + return hipMemcpyDefault; +} + +dragonError_t +dragonGPU_hip::copy(void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type) +{ + auto hip_rc = fn_hipMemcpy(dst_addr, src_addr, size, dragon_to_hip_memcpy_kind(memcpy_type)); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to copy memory", hip_rc).c_str()); + } + + hip_rc = fn_hipDeviceSynchronize(); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to synchronize device", hip_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_hip::memset(void *addr, int val, size_t num_bytes) +{ + auto hip_rc = fn_hipMemset(addr, val, num_bytes); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE,
this->get_errstr("failed to allocate device memory", hip_rc).c_str()); + } + + hip_rc = fn_hipDeviceSynchronize(); + if (hip_rc != hipSuccess) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to synchronize device", hip_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +std::string +dragonGPU_hip::get_errstr(const char *event, int hip_rc) +{ + auto errstr = fn_hipGetErrorString((hipError_t) hip_rc); + + auto log_str = + std::string(event) + + std::string(": rc=") + std::to_string(hip_rc) + + (errstr ? std::string(", ") + std::string(errstr) : std::string("")); + + if (dragon_gpu_debug) { + fprintf(dragon_gpu_log, "%s", log_str.c_str()); + fflush(dragon_gpu_log); + } + + return log_str; +} + +#endif // HAVE_HIP_INCLUDE + diff --git a/src/lib/gpu/hip.hpp b/src/lib/gpu/hip.hpp new file mode 100644 index 0000000..c2b29bc --- /dev/null +++ b/src/lib/gpu/hip.hpp @@ -0,0 +1,45 @@ +#ifndef HAVE_DRAGON_GPU_HIP_HPP +#define HAVE_DRAGON_GPU_HIP_HPP + +#define __HIP_PLATFORM_AMD__ + +#include "gpu.hpp" +#include "hip/hip_runtime.h" + +class dragonGPU_hip final : public dragonGPU { +public: + + static constexpr const char *libname{"libamdhip64.so"}; + + dragonGPU_hip(); + + dragonError_t + mem_alloc(void **addr, size_t size) override; + + dragonError_t + mem_free(void *addr) override; + + dragonError_t + get_ipc_handle(void *addr, std::vector& ipc_handle) override; + + dragonError_t + free_ipc_handle(std::vector& ipc_handle) override; + + dragonError_t + attach(std::vector& ipc_handle, void **addr) override; + + dragonError_t + detach(void *addr) override; + + dragonError_t + copy(void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type) override; + + dragonError_t + memset(void *addr, int val, size_t num_bytes) override; + + std::string + get_errstr(const char *event, int hip_rc) override; +}; + +#endif // HAVE_DRAGON_GPU_HIP_HPP + diff --git a/src/lib/gpu/ze.cpp b/src/lib/gpu/ze.cpp new file mode 100644 index 0000000..d3eb0d0 --- /dev/null +++ b/src/lib/gpu/ze.cpp @@ -0,0 +1,730 @@ +#ifdef HAVE_ZE_INCLUDE + +#include "ze.hpp" + +// init ze backend + +ze_result_t (*fn_zeInit)(ze_init_flag_t flags); +ze_result_t (*fn_zeDriverGet)(uint32_t *count, ze_driver_handle_t *drivers); +ze_result_t (*fn_zeDeviceGet)(ze_driver_handle_t driver, uint32_t *count, ze_device_handle_t *devices); +ze_result_t (*fn_zeDeviceGetSubDevices)(ze_device_handle_t device, uint32_t *count, ze_device_handle_t *subdevices); +ze_result_t (*fn_zeContextCreate)(ze_driver_handle_t driver, ze_context_desc_t *desc, ze_context_handle_t *context); +ze_result_t (*fn_zeContextDestroy)(ze_context_handle_t context); +ze_result_t (*fn_zeContextSystemBarrier)(ze_context_handle_t context, ze_device_handle_t device); +ze_result_t (*fn_zeMemAllocDevice)(ze_context_handle_t context, const ze_device_mem_alloc_desc_t *device_desc, size_t size, size_t alignment, ze_device_handle_t device, void **addr); +ze_result_t (*fn_zeMemFree)(ze_context_handle_t context, void *addr); +ze_result_t (*fn_zeMemGetIpcHandle)(ze_context_handle_t context, void *addr, ze_ipc_mem_handle_t *ipc_handle); +ze_result_t (*fn_zeMemPutIpcHandle)(ze_context_handle_t context, ze_ipc_mem_handle_t ipc_handle); +ze_result_t (*fn_zeMemOpenIpcHandle)(ze_context_handle_t context, ze_device_handle_t device, ze_ipc_mem_handle_t ipc_handle, ze_ipc_memory_flags_t flags, void **addr); +ze_result_t (*fn_zeMemCloseIpcHandle)(ze_context_handle_t ipc_handle, void *addr); +ze_result_t (*fn_zeDeviceGetCommandQueueGroupProperties)(ze_device_handle_t 
subdevice, uint32_t *group_count, ze_command_queue_group_properties_t *cmdq_group_properties); +ze_result_t (*fn_zeDeviceGetMemoryProperties)(ze_device_handle_t subdevice, uint32_t *local_mem_count, ze_device_memory_properties_t *mem_properties); +ze_result_t (*fn_zeCommandListCreate)(ze_context_handle_t context, ze_device_handle_t subdevice, ze_command_list_desc_t *cmdl_desc, ze_command_list_handle_t *command_list); +ze_result_t (*fn_zeCommandListCreateImmediate)(ze_context_handle_t context, ze_device_handle_t subdevice, ze_command_queue_desc_t *cmdq_desc, ze_command_list_handle_t *command_list); +ze_result_t (*fn_zeCommandListDestroy)(ze_command_list_handle_t command_list); +ze_result_t (*fn_zeCommandQueueCreate)(ze_context_handle_t context, ze_device_handle_t subdevice, ze_command_queue_desc_t *cmdq_desc, ze_command_queue_handle_t *command_queue); +ze_result_t (*fn_zeCommandQueueDestroy)(ze_command_queue_handle_t command_queue); +ze_result_t (*fn_zeCommandQueueExecuteCommandLists)(ze_command_queue_handle_t command_queue, uint32_t num_lists, ze_command_list_handle_t *command_lists, ze_fence_handle_t fence); +ze_result_t (*fn_zeCommandQueueSynchronize)(ze_command_queue_handle_t command_queue, uint64_t timeout); +ze_result_t (*fn_zeCommandListAppendMemoryCopy)(ze_command_list_handle_t command_list, void *dst_addr, const void *src_addr, size_t size, ze_event_handle_t signal, uint32_t num_events, ze_event_handle_t *wait_events); +ze_result_t (*fn_zeCommandListAppendMemoryFill)(ze_command_list_handle_t command_list, void *addr, const void *pattern, size_t pattern_size, size_t size, ze_event_handle_t signal_event, uint32_t num_wait_events, ze_event_handle_t *wait_events); +ze_result_t (*fn_zeCommandListAppendBarrier)(ze_command_list_handle_t command_list, ze_event_handle_t signal_event, uint32_t num_wait_events, ze_event_handle_t *wait_events); +ze_result_t (*fn_zeCommandListClose)(ze_command_list_handle_t command_list); +ze_result_t (*fn_zeCommandListReset)(ze_command_list_handle_t command_list); +ze_result_t (*fn_zeFenceCreate)(ze_command_queue_handle_t command_queue, ze_fence_desc_t *fence_desc, ze_fence_handle_t *fence); +ze_result_t (*fn_zeFenceDestroy)(ze_fence_handle_t fence); +ze_result_t (*fn_zeFenceHostSynchronize)(ze_fence_handle_t fence, uint64_t timeout); +ze_result_t (*fn_zeFenceReset)(ze_fence_handle_t fence); +ze_result_t (*fn_zeEventPoolCreate)(ze_context_handle_t context, ze_event_pool_desc_t *event_pool_desc, uint32_t num_devices, ze_device_handle_t *devices, ze_event_pool_handle_t *event_pool); +ze_result_t (*fn_zeEventPoolDestroy)(ze_event_pool_handle_t event_pool); +ze_result_t (*fn_zeEventCreate)(ze_event_pool_handle_t event_pool, ze_event_desc_t *event_desc, ze_event_handle_t *event); +ze_result_t (*fn_zeEventDestroy)(ze_event_handle_t event); +ze_result_t (*fn_zeEventHostSynchronize)(ze_event_handle_t event, uint64_t timeout); +ze_result_t (*fn_zeEventHostReset)(ze_event_handle_t event); +ze_result_t (*fn_zeDriverGetLastErrorDescription)(ze_driver_handle_t driver, const char **last_err); + +void * +dragon_gpu_open_ze_lib() +{ + return dlopen(dragonGPU_ze::libname, RTLD_LAZY | RTLD_GLOBAL); +} + +dragonError_t +dragon_gpu_resolve_ze_symbols(void *libhandle) +{ + fn_zeInit = (ze_result_t (*)(ze_init_flag_t)) dlsym(libhandle, "zeInit"); + assert(fn_zeInit != nullptr); + + fn_zeDriverGet = (ze_result_t (*)(uint32_t *, ze_driver_handle_t *)) dlsym(libhandle, "zeDriverGet"); + assert(fn_zeDriverGet != nullptr); + + fn_zeDeviceGet = (ze_result_t (*)(ze_driver_handle_t, 
uint32_t *, ze_device_handle_t *)) dlsym(libhandle, "zeDeviceGet"); + assert(fn_zeDeviceGet != nullptr); + + fn_zeDeviceGetSubDevices = (ze_result_t (*)(ze_device_handle_t, uint32_t *, ze_device_handle_t *)) dlsym(libhandle, "zeDeviceGetSubDevices"); + assert(fn_zeDeviceGetSubDevices != nullptr); + + fn_zeContextCreate = (ze_result_t (*)(ze_driver_handle_t, ze_context_desc_t *, ze_context_handle_t *)) dlsym(libhandle, "zeContextCreate"); + assert(fn_zeContextCreate != nullptr); + + fn_zeContextDestroy = (ze_result_t (*)(ze_context_handle_t)) dlsym(libhandle, "zeContextDestroy"); + assert(fn_zeContextCreate != nullptr); + + // TODO: This function doesn't seem to exist in the libze_intel_gpu + // library on pinoak, even though it appears in online docs. + //fn_zeContextSystemBarrier = (ze_result_t (*)(ze_context_handle_t, ze_device_handle_t)) dlsym(libhandle, "zeContextSystemBarrier"); + //assert(fn_zeContextSystemBarrier != nullptr); + + fn_zeMemAllocDevice = (ze_result_t (*)(ze_context_handle_t, const ze_device_mem_alloc_desc_t *, size_t, size_t, ze_device_handle_t, void **)) dlsym(libhandle, "zeMemAllocDevice"); + assert(fn_zeMemAllocDevice != nullptr); + + fn_zeMemFree = (ze_result_t (*)(ze_context_handle_t, void *)) dlsym(libhandle, "zeMemFree"); + assert(fn_zeMemFree != nullptr); + + fn_zeMemGetIpcHandle = (ze_result_t (*)(ze_context_handle_t, void *, ze_ipc_mem_handle_t *)) dlsym(libhandle, "zeMemGetIpcHandle"); + assert(fn_zeMemGetIpcHandle != nullptr); + + fn_zeMemPutIpcHandle = (ze_result_t (*)(ze_context_handle_t, ze_ipc_mem_handle_t)) dlsym(libhandle, "zeMemPutIpcHandle"); + // zeMemPutIpcHandle is new with 1.10, so it might be null + //assert(fn_zeMemPutIpcHandle != nullptr); + + fn_zeMemOpenIpcHandle = (ze_result_t (*)(ze_context_handle_t, ze_device_handle_t, ze_ipc_mem_handle_t, ze_ipc_memory_flags_t, void **)) dlsym(libhandle, "zeMemOpenIpcHandle"); + assert(fn_zeMemOpenIpcHandle != nullptr); + + fn_zeMemCloseIpcHandle = (ze_result_t (*)(ze_context_handle_t, void *)) dlsym(libhandle, "zeMemCloseIpcHandle"); + assert(fn_zeMemCloseIpcHandle != nullptr); + + fn_zeDeviceGetCommandQueueGroupProperties = (ze_result_t (*)(ze_device_handle_t, uint32_t *, ze_command_queue_group_properties_t *)) dlsym(libhandle, "zeDeviceGetCommandQueueGroupProperties"); + assert(fn_zeDeviceGetCommandQueueGroupProperties != nullptr); + + fn_zeDeviceGetMemoryProperties = (ze_result_t (*)(ze_device_handle_t, uint32_t *, ze_device_memory_properties_t *)) dlsym(libhandle, "zeDeviceGetMemoryProperties"); + assert(fn_zeDeviceGetMemoryProperties != nullptr); + + fn_zeCommandListCreate = (ze_result_t (*)(ze_context_handle_t, ze_device_handle_t, ze_command_list_desc_t *, ze_command_list_handle_t *)) dlsym(libhandle, "zeCommandListCreate"); + assert(fn_zeCommandListCreate != nullptr); + + fn_zeCommandListCreateImmediate = (ze_result_t (*)(ze_context_handle_t, ze_device_handle_t, ze_command_queue_desc_t *, ze_command_list_handle_t *)) dlsym(libhandle, "zeCommandListCreateImmediate"); + assert(fn_zeCommandListCreateImmediate != nullptr); + + fn_zeCommandListDestroy = (ze_result_t (*)(ze_command_list_handle_t)) dlsym(libhandle, "zeCommandListDestroy"); + assert(fn_zeCommandListDestroy != nullptr); + + fn_zeCommandListAppendMemoryCopy = (ze_result_t (*)(ze_command_list_handle_t, void *, const void *, size_t, ze_event_handle_t, uint32_t, ze_event_handle_t *)) dlsym(libhandle, "zeCommandListAppendMemoryCopy"); + assert(fn_zeCommandListAppendMemoryCopy != nullptr); + + fn_zeCommandListAppendMemoryFill = (ze_result_t 
(*)(ze_command_list_handle_t, void *, const void *, size_t, size_t, ze_event_handle_t, uint32_t, ze_event_handle_t *)) dlsym(libhandle, "zeCommandListAppendMemoryFill"); + assert(fn_zeCommandListAppendMemoryFill != nullptr); + + fn_zeCommandListAppendBarrier = (ze_result_t (*)(ze_command_list_handle_t, ze_event_handle_t, uint32_t, ze_event_handle_t *)) dlsym(libhandle, "zeCommandListAppendBarrier"); + assert(fn_zeCommandListAppendBarrier != nullptr); + + fn_zeCommandListClose = (ze_result_t (*)(ze_command_list_handle_t)) dlsym(libhandle, "zeCommandListClose"); + assert(fn_zeCommandListClose != nullptr); + + fn_zeCommandListReset = (ze_result_t (*)(ze_command_list_handle_t)) dlsym(libhandle, "zeCommandListReset"); + assert(fn_zeCommandListReset != nullptr); + + fn_zeCommandQueueCreate = (ze_result_t (*)(ze_context_handle_t, ze_device_handle_t, ze_command_queue_desc_t *, ze_command_queue_handle_t *)) dlsym(libhandle, "zeCommandQueueCreate"); + assert(fn_zeCommandQueueCreate != nullptr); + + fn_zeCommandQueueDestroy = (ze_result_t (*)(ze_command_queue_handle_t)) dlsym(libhandle, "zeCommandQueueDestroy"); + assert(fn_zeCommandQueueDestroy != nullptr); + + fn_zeCommandQueueExecuteCommandLists = (ze_result_t (*)(ze_command_queue_handle_t, uint32_t, ze_command_list_handle_t *, ze_fence_handle_t)) dlsym(libhandle, "zeCommandQueueExecuteCommandLists"); + assert(fn_zeCommandQueueExecuteCommandLists != nullptr); + + fn_zeCommandQueueSynchronize = (ze_result_t (*)(ze_command_queue_handle_t, uint64_t)) dlsym(libhandle, "zeCommandQueueSynchronize"); + assert(fn_zeCommandQueueSynchronize != nullptr); + + fn_zeFenceCreate = (ze_result_t (*)(ze_command_queue_handle_t, ze_fence_desc_t *, ze_fence_handle_t *)) dlsym(libhandle, "zeFenceCreate"); + assert(fn_zeFenceCreate != nullptr); + + fn_zeFenceDestroy = (ze_result_t (*)(ze_fence_handle_t)) dlsym(libhandle, "zeFenceDestroy"); + assert(fn_zeFenceDestroy != nullptr); + + fn_zeFenceHostSynchronize = (ze_result_t (*)(ze_fence_handle_t, uint64_t)) dlsym(libhandle, "zeFenceHostSynchronize"); + assert(fn_zeFenceHostSynchronize != nullptr); + + fn_zeFenceReset = (ze_result_t (*)(ze_fence_handle_t)) dlsym(libhandle, "zeFenceReset"); + assert(fn_zeFenceReset != nullptr); + + fn_zeEventPoolCreate = (ze_result_t (*)(ze_context_handle_t, ze_event_pool_desc_t *, uint32_t, ze_device_handle_t *, ze_event_pool_handle_t *)) dlsym(libhandle, "zeEventPoolCreate"); + assert(fn_zeEventPoolCreate != nullptr); + + fn_zeEventPoolDestroy = (ze_result_t (*)(ze_event_pool_handle_t)) dlsym(libhandle, "zeEventPoolDestroy"); + assert(fn_zeEventPoolDestroy != nullptr); + + fn_zeEventCreate = (ze_result_t (*)(ze_event_pool_handle_t, ze_event_desc_t *, ze_event_handle_t *)) dlsym(libhandle, "zeEventCreate"); + assert(fn_zeEventCreate != nullptr); + + fn_zeEventDestroy = (ze_result_t (*)(ze_event_handle_t)) dlsym(libhandle, "zeEventDestroy"); + assert(fn_zeEventDestroy != nullptr); + + fn_zeEventHostSynchronize = (ze_result_t (*)(ze_event_handle_t, uint64_t)) dlsym(libhandle, "zeEventHostSynchronize"); + assert(fn_zeEventHostSynchronize != nullptr); + + fn_zeEventHostReset = (ze_result_t (*)(ze_event_handle_t)) dlsym(libhandle, "zeEventHostReset"); + assert(fn_zeEventHostReset != nullptr); + + // TODO: This function doesn't seem to exist in the libze_intel_gpu + // library on pinoak, even though it appears in online docs. 
+ //fn_zeDriverGetLastErrorDescription = (ze_result_t (*)(ze_driver_handle_t, const char **)) dlsym(libhandle, "zeDriverGetLastErrorDescription"); + //assert(fn_zeDriverGetLastErrorDescription != nullptr); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_gpu_setup_ze(void *libhandle, dragonGPUHandle_t *gpuh) +{ + dragon_gpu_resolve_ze_symbols(libhandle); + + try { + gpuh->dgpu = std::make_shared(); + } catch (std::exception& e) { + append_err_return(DRAGON_FAILURE, e.what()); + } + + no_err_return(DRAGON_SUCCESS); +} + +// member function definitions + +Tile::Tile(dragonGPU_ze *gpu, ze_device_handle_t subdevice) +{ + this->gpu = gpu; + this->subdevice = subdevice; + this->local_mem_idx = 0u; + + // need to find the group ordinal for a command queue with copy + // functionality before we can create the command list/queue. + // also need to init stuff for selecting local memory ordinals + auto dragon_rc = this->init_ordinals(); + if (dragon_rc != DRAGON_SUCCESS) { + throw std::runtime_error("failed to initialize ordinals"); + } + + dragon_rc = this->create_command_objects(); + if (dragon_rc != DRAGON_SUCCESS) { + throw std::runtime_error("failed to create command objects"); + } +} + +dragonError_t +Tile::init_ordinals() +{ + // get all the command queue groups + + auto cmdq_group_count = 0u; + + auto ze_rc = fn_zeDeviceGetCommandQueueGroupProperties(this->subdevice, &cmdq_group_count, nullptr); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to get command queue group properties", ze_rc).c_str()); + } + + std::vector cmdq_group_properties; + cmdq_group_properties.resize(cmdq_group_count); + + for (auto i = 0u; i < cmdq_group_count; ++i) { + cmdq_group_properties[i].stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES; + cmdq_group_properties[i].pNext = nullptr; + } + + ze_rc = fn_zeDeviceGetCommandQueueGroupProperties(this->subdevice, &cmdq_group_count, &cmdq_group_properties[0]); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to get command queue group properties", ze_rc).c_str()); + } + + // find a command queue group that supports copying + this->copyq_group_ord = cmdq_group_count; + for (auto i = 0u; i < cmdq_group_count; ++i) { + if (cmdq_group_properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) { + this->copyq_group_ord = i; + break; + } + } + + if (this->copyq_group_ord == cmdq_group_count) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to find a command queue group that supports copying", ze_rc).c_str()); + } + + // the local memory ordinal needs to be less than the count + // returned from zeDeviceGetMemoryProperties. 
we will use + // round-robin to select such an ordinal + + this->local_mem_count = 0u; + + ze_rc = fn_zeDeviceGetMemoryProperties(this->subdevice, &this->local_mem_count, nullptr); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to get memory properties", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +Tile::create_command_objects() +{ + // NOTE: some of these objects will be superfluous, but we're + // hanging on to them just in case they come in handy later + + auto ze_rc = ZE_RESULT_SUCCESS; + + // create command queue + + ze_command_queue_desc_t cmdq_desc{ + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + nullptr, + this->copyq_group_ord, + 0, // index + 0, // flags + ZE_COMMAND_QUEUE_MODE_DEFAULT, + ZE_COMMAND_QUEUE_PRIORITY_NORMAL + }; + + ze_rc = fn_zeCommandQueueCreate( + this->gpu->context, + this->subdevice, + &cmdq_desc, + &this->command_queue + ); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to create command queue", ze_rc).c_str()); + } + + // create command list + + ze_command_list_desc_t cmdl_desc{ + ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + nullptr, + this->copyq_group_ord, + 0 // flags + }; + + ze_rc = fn_zeCommandListCreate( + this->gpu->context, + this->subdevice, + &cmdl_desc, + &this->command_list + ); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to create command list", ze_rc).c_str()); + } + + ze_rc = fn_zeCommandListCreateImmediate( + this->gpu->context, + this->subdevice, + &cmdq_desc, + &this->immediate_command_list + ); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to create immediate command list", ze_rc).c_str()); + } + + // create fence + + ze_fence_desc_t fence_desc = { + ZE_STRUCTURE_TYPE_FENCE_DESC, + nullptr, + 0 // flags + }; + + ze_rc = fn_zeFenceCreate(this->command_queue, &fence_desc, &this->fence); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to create fence", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +Tile::destroy_command_objects() +{ + auto ze_rc = ZE_RESULT_SUCCESS; + + // destroy command list + + ze_rc = fn_zeCommandListDestroy(this->command_list); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to destroy command list", ze_rc).c_str()); + } + + ze_rc = fn_zeCommandListDestroy(this->immediate_command_list); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to destroy command list", ze_rc).c_str()); + } + + // destroy command queue + + ze_rc = fn_zeCommandQueueSynchronize(this->command_queue, dragon_sec_to_nsec(30)); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to synchronize command queue", ze_rc).c_str()); + } + + ze_rc = fn_zeCommandQueueDestroy(this->command_queue); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to destroy command queue", ze_rc).c_str()); + } + + // destroy fence + + ze_rc = fn_zeFenceDestroy(this->fence); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->gpu->get_errstr("failed to destroy fence", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonGPU_ze::dragonGPU_ze() +{ + this->backend_type = DRAGON_GPU_BACKEND_ZE; + + auto init_flag = 
(ze_init_flag_t) 0; + auto ze_rc = fn_zeInit(init_flag); + if (ze_rc != ZE_RESULT_SUCCESS) { + throw std::runtime_error("failed to initialize ZE library"); + } + + // a driver corresponds to a collection of physical devices in the + // system accessed by the same Level-Zero driver, so we assume there's + // only one driver for now + auto driver_count = 1u; + + ze_rc = fn_zeDriverGet(&driver_count, &this->driver); + if (ze_rc != ZE_RESULT_SUCCESS || driver_count == 0u) { + throw std::runtime_error("failed to get driver"); + } + + // get devices + + uint32_t device_count = 0; + + ze_rc = fn_zeDeviceGet(this->driver, &device_count, nullptr); + if (ze_rc != ZE_RESULT_SUCCESS || device_count == 0u) { + throw std::runtime_error("failed to get device count"); + } + + this->device.resize(device_count); + + ze_rc = fn_zeDeviceGet(this->driver, &device_count, &this->device[0]); + if (ze_rc != ZE_RESULT_SUCCESS) { + throw std::runtime_error("failed to get devices"); + } + + // get subdevices + + this->subdevice_count.resize(device_count, 0u); + auto tile_count = 0u; + + for (auto i = 0u; i < device_count; ++i) { + auto& device = this->device[i]; + + ze_rc = fn_zeDeviceGetSubDevices(device, &this->subdevice_count[i], nullptr); + if (ze_rc != ZE_RESULT_SUCCESS) { + throw std::runtime_error("failed to get get subdevice count"); + } + + tile_count += this->subdevice_count[i]; + } + + this->subdevice.resize(tile_count); + tile_count = 0u; + + for (auto i = 0u; i < device_count; ++i) { + auto& device = this->device[i]; + + ze_rc = fn_zeDeviceGetSubDevices(device, &this->subdevice_count[i], &this->subdevice[tile_count]); + if (ze_rc != ZE_RESULT_SUCCESS) { + throw std::runtime_error("failed to get get subdevices"); + } + + tile_count += this->subdevice_count[i]; + } + + // TODO: select nearest device + this->device_idx = 0; + + // create a context before initializing the subdevices + auto derr = this->create_context(); + if (derr != DRAGON_SUCCESS) { + throw std::runtime_error("failed to create context"); + } + + // initialize the subdevices (depends on creating context first) + for (auto i = 0u; i < tile_count; ++i) { + auto tile = std::make_shared(this, this->subdevice[i]); + this->tile.push_back(tile); + } + + // create event + + ze_event_pool_desc_t event_pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .pNext = nullptr, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 1u + }; + + ze_rc = fn_zeEventPoolCreate(this->context, &event_pool_desc, 1, &this->subdevice[this->device_idx], &this->event_pool); + if (ze_rc != ZE_RESULT_SUCCESS) { + throw std::runtime_error("failed to create event pool"); + } + + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .pNext = nullptr, + .index = 0, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST + }; + + ze_rc = fn_zeEventCreate(this->event_pool, &event_desc, &this->event); + if (ze_rc != ZE_RESULT_SUCCESS) { + throw std::runtime_error("failed to create event"); + } + +} + +dragonGPU_ze::~dragonGPU_ze() +{ + auto ze_rc = ZE_RESULT_SUCCESS; + + // destroy event pool + + ze_rc = fn_zeEventDestroy(this->event); + if (ze_rc != ZE_RESULT_SUCCESS) { + // call get_errstr just to log the error (assuming debugging is enabled) + this->get_errstr("failed to destroy event", ze_rc).c_str(); + } + + ze_rc = fn_zeEventPoolDestroy(this->event_pool); + if (ze_rc != ZE_RESULT_SUCCESS) { + // call get_errstr just to log the error (assuming debugging is enabled) + this->get_errstr("failed to destroy event pool", 
ze_rc).c_str(); + } + + // destroy tiles + + for (auto& tile: this->tile) { + tile->destroy_command_objects(); + } + + // destroy context + + ze_rc = fn_zeContextDestroy(this->context); + if (ze_rc != ZE_RESULT_SUCCESS) { + // call get_errstr just to log the error (assuming debugging is enabled) + this->get_errstr("failed to destroy context", ze_rc).c_str(); + } +} + +dragonError_t +dragonGPU_ze::create_context() +{ + ze_context_desc_t ctx_desc{ + ZE_STRUCTURE_TYPE_CONTEXT_DESC, + nullptr, + 0 + }; + + auto ze_rc = fn_zeContextCreate(this->driver, &ctx_desc, &this->context); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to create a GPU context", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::mem_alloc(void **addr, size_t size) +{ + auto my_tile = this->tile[this->device_idx]; + + ze_device_mem_alloc_desc_t mem_alloc_desc = { + ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + nullptr, + 0, // flags + my_tile->get_local_mem_idx() // ordinal + }; + + // TODO: is this a reasonable choice for alignment? + auto alignment = 64ul; + + auto ze_rc = fn_zeMemAllocDevice( + this->context, + &mem_alloc_desc, + size, + alignment, + this->subdevice[this->device_idx], + addr + ); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to allocate device memory", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::mem_free(void *addr) +{ + auto ze_rc = fn_zeMemFree(this->context, addr); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to free device memory", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::get_ipc_handle(void *addr, std::vector& ipc_handle_out) +{ + ze_ipc_mem_handle_t ipc_handle; + + auto ze_rc = fn_zeMemGetIpcHandle(this->context, addr, &ipc_handle); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to get IPC handle", ze_rc).c_str()); + } + + ipc_handle_out.resize(sizeof(ze_ipc_mem_handle_t)); + memcpy(&ipc_handle_out[0], &ipc_handle, ipc_handle_out.size()); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::free_ipc_handle(std::vector& ipc_handle_in) +{ + if (fn_zeMemPutIpcHandle != nullptr) { + ze_ipc_mem_handle_t ipc_handle; + memcpy(&ipc_handle, &ipc_handle_in[0], sizeof(ze_ipc_mem_handle_t)); + + auto ze_rc = fn_zeMemPutIpcHandle(this->context, ipc_handle); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to put IPC handle", ze_rc).c_str()); + } + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::attach(std::vector& ipc_handle_in, void **addr) +{ + ze_ipc_mem_handle_t ipc_handle; + memcpy(&ipc_handle, &ipc_handle_in[0], sizeof(ze_ipc_mem_handle_t)); + + auto ze_rc = fn_zeMemOpenIpcHandle(this->context, this->subdevice[this->device_idx], ipc_handle, 0, addr); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to open IPC handle", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::detach(void *addr) +{ + auto ze_rc = fn_zeMemCloseIpcHandle(this->context, addr); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to close IPC handle", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::copy(void *dst_addr, const void *src_addr, 
size_t size, dragonGPUMemcpyType_t memcpy_type) +{ + auto my_tile = this->tile[this->device_idx]; + auto my_imm_cmd_list = my_tile->get_immediate_command_list(); + + // silence compiler warning + // (we don't need memcpy_type in this derived class) + (void)memcpy_type; + + // append memcpy + + auto ze_rc = + fn_zeCommandListAppendMemoryCopy( + my_imm_cmd_list, + dst_addr, src_addr, size, + this->event, 0, nullptr + ); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to append memory copy", ze_rc).c_str()); + } + + // synchronize with event + + ze_rc = fn_zeEventHostSynchronize(this->event, dragon_sec_to_nsec(30)); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to synchronize with event", ze_rc).c_str()); + } + + ze_rc = fn_zeEventHostReset(this->event); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to reset event", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragonGPU_ze::memset(void *addr, int val, size_t num_bytes) +{ + auto my_tile = this->tile[this->device_idx]; + auto my_imm_cmd_list = my_tile->get_immediate_command_list(); + auto val_size = 1ul; // always use 1 byte for the pattern size + + // append memory fill + + auto ze_rc = + fn_zeCommandListAppendMemoryFill( + my_imm_cmd_list, + addr, (void *) &val, val_size, num_bytes, + this->event, 0, nullptr + ); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to append memory fill", ze_rc).c_str()); + } + + // synchronize with event + + ze_rc = fn_zeEventHostSynchronize(this->event, dragon_sec_to_nsec(30)); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to synchronize with event", ze_rc).c_str()); + } + + ze_rc = fn_zeEventHostReset(this->event); + if (ze_rc != ZE_RESULT_SUCCESS) { + append_err_return(DRAGON_FAILURE, this->get_errstr("failed to reset event", ze_rc).c_str()); + } + + no_err_return(DRAGON_SUCCESS); +} + +std::string +dragonGPU_ze::get_errstr(const char *event, int ze_rc) +{ + const char *last_err = nullptr; + // TODO: this function doesn't appear to be in the libze_intel_gpu.so + // library on pinoak + //fn_zeDriverGetLastErrorDescription(this->driver, &last_err); + + auto log_str = + std::string(event) + + std::string(": rc=") + std::to_string(ze_rc) + + (last_err ? 
std::string(", ") + std::string(last_err) : std::string("")); + + if (dragon_gpu_debug) { + fprintf(dragon_gpu_log, "%s\n", log_str.c_str()); + fflush(dragon_gpu_log); + } + + return log_str; +} + +#endif // HAVE_ZE_INCLUDE + diff --git a/src/lib/gpu/ze.hpp b/src/lib/gpu/ze.hpp new file mode 100644 index 0000000..855e388 --- /dev/null +++ b/src/lib/gpu/ze.hpp @@ -0,0 +1,124 @@ +#ifndef HAVE_DRAGON_GPU_ZE_HPP +#define HAVE_DRAGON_GPU_ZE_HPP + +#include "gpu.hpp" +#include "ze_api.h" + +// forward declarations + +class dragonGPU_ze; + +// class definitions + +class Tile { +private: + + ze_device_handle_t subdevice; + ze_command_list_handle_t command_list; + ze_command_list_handle_t immediate_command_list; + ze_command_queue_handle_t command_queue; + ze_fence_handle_t fence; + uint32_t copyq_group_ord; + uint32_t local_mem_idx; + uint32_t local_mem_count; + dragonGPU_ze *gpu; + + dragonError_t + init_ordinals(); + + dragonError_t + create_command_objects(); + +public: + + Tile(dragonGPU_ze *gpu, ze_device_handle_t subdevice); + + dragonError_t + destroy_command_objects(); + + ze_command_list_handle_t + get_command_list() + { + return this->command_list; + } + + ze_command_list_handle_t + get_immediate_command_list() + { + return this->immediate_command_list; + } + + ze_command_queue_handle_t + get_command_queue() + { + return this->command_queue; + } + + ze_fence_handle_t + get_fence() + { + return this->fence; + } + + uint32_t + get_local_mem_idx() + { + // round-robin over local memory ordinals + auto save_idx = this->local_mem_idx; + this->local_mem_idx = (this->local_mem_idx + 1) % this->local_mem_count; + return save_idx; + } +}; + +class dragonGPU_ze final : public dragonGPU { +private: + + ze_driver_handle_t driver; + std::vector device; + std::vector subdevice; + std::vector subdevice_count; + std::vector> tile; + ze_event_pool_handle_t event_pool; + ze_event_handle_t event; + + dragonError_t + create_context(); + +public: + + static constexpr const char *libname{"libze_intel_gpu.so.1"}; + ze_context_handle_t context; + + dragonGPU_ze(); + ~dragonGPU_ze(); + + dragonError_t + mem_alloc(void **addr, size_t size) override; + + dragonError_t + mem_free(void *addr) override; + + dragonError_t + get_ipc_handle(void *addr, std::vector& ipc_handle) override; + + dragonError_t + free_ipc_handle(std::vector& ipc_handle) override; + + dragonError_t + attach(std::vector& ipc_handle, void **addr) override; + + dragonError_t + detach(void *addr) override; + + dragonError_t + copy(void *dst_addr, const void *src_addr, size_t size, dragonGPUMemcpyType_t memcpy_type) override; + + dragonError_t + memset(void *addr, int val, size_t num_bytes) override; + + std::string + get_errstr(const char *event, int ze_rc) override; +}; + +#endif // HAVE_DRAGON_GPU_ZE_HPP + diff --git a/src/lib/hashtable.c b/src/lib/hashtable.c index d48695d..e97b038 100644 --- a/src/lib/hashtable.c +++ b/src/lib/hashtable.c @@ -131,6 +131,7 @@ _copy_out(const dragonHashtable_t* ht, char* dest, const char* source, const uin no_err_return(DRAGON_SUCCESS); } +#ifdef HT_DEBUG static void _strcat_key(char* destination, dragonHashtable_t* ht, const char* key) { char key_str[80]; @@ -145,7 +146,6 @@ _strcat_key(char* destination, dragonHashtable_t* ht, const char* key) { } } -#ifdef HT_DEBUG static void _print_key(dragonHashtable_t* ht, char* key) { char key_str[80]; @@ -208,134 +208,73 @@ _print_chain(dragonHashtable_t* ht, uint64_t idx) { #endif static dragonError_t -_hashtable_add(dragonHashtable_t* ht, const char* key, const char* 
value, bool replace) -{ +_rehash(dragonHashtable_t* ht) { + dragonError_t err; + if (ht == NULL) err_return(DRAGON_HASHTABLE_NULL_POINTER,"The dragonHashtable handle is NULL."); - if (key == NULL) - err_return(DRAGON_HASHTABLE_NULL_POINTER,"The key is NULL."); - - if (value == NULL) - err_return(DRAGON_HASHTABLE_NULL_POINTER,"The value is NULL."); - - if (!replace && (*ht->header.count_ptr >= ht->header.num_slots/2)) - err_return(DRAGON_HASHTABLE_FULL, "Hashtable is full."); - - _check_armor(ht); - - uint64_t idx = _hash(key, ht->header.key_len) % ht->header.num_slots; - uint64_t entry_len = (ht->header.key_len + ht->header.value_len)*sizeof(uint64_t); - uint64_t loc = max_uint64; - char* key_ptr = NULL; - char* value_ptr = NULL; - bool searching = true; - dragonError_t bit_rc; - dragonError_t rc; - key_ptr = ht->slots + entry_len * idx; - - while (searching) { - unsigned char allocated; - unsigned char placeholder; - - bit_rc = dragon_bitset_get(&ht->allocated, idx, &allocated); - - if (bit_rc != DRAGON_SUCCESS) { - append_err_return(bit_rc, "Could not add entry into hashtable."); - } + uint64_t num_kvs = *ht->header.num_kvs; + char* copy = malloc(entry_len * num_kvs); + if (copy == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate memory for rehash of hashtable."); - if (!allocated) { - searching = false; - } else { - bit_rc = dragon_bitset_get(&ht->placeholder, idx, &placeholder); + char* current = copy; + uint64_t num_copied = 0; + uint64_t idx = 0; + unsigned char allocated; + unsigned char placeholder; + char* entry_ptr; + char* key; + char* value; - if (bit_rc != DRAGON_SUCCESS) { - append_err_return(bit_rc, "Could not add entry into hashtable."); - } + while (num_copied < num_kvs) { + if (idx > ht->header.num_slots) + err_return(DRAGON_FAILURE, "The rehash did not find the expected number of key/value pairs."); - if ((placeholder == 1) && (loc == max_uint64)) { - loc = idx; - } else { - // It is not a placeholder and it is allocated so see if keys are equal - if (_keys_equal(key_ptr, key, ht->header.key_len)) { - // already in the hashtable - if (replace) { - // replace it - value_ptr = key_ptr + ht->header.key_len * sizeof(uint64_t); - rc = _copy_in(ht, value_ptr, value, ht->header.value_len); - if (rc != DRAGON_SUCCESS) - append_err_return(rc, "There was an error on copy."); - no_err_return(DRAGON_SUCCESS); - } + err = dragon_bitset_get(&ht->allocated, idx, &allocated); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not rehash entry in hashtable."); - // otherwise, replace was false so return an error - char err_str[200]; - char key_str[80]; - strcpy(key_str,""); - _strcat_key(key_str, ht, key); - snprintf(err_str, 200, "Duplicate key detected for key %s", key_str); - err_return(DRAGON_HASHTABLE_DUPLICATE_KEY, err_str); - } - } - } + err = dragon_bitset_get(&ht->placeholder, idx, &placeholder); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not rehash entry in hashtable."); - if (searching) { - // advance idx mod length of the slots array avoiding multiplication in the loop. 
- idx = idx + 1; - if (idx == ht->header.num_slots) { - idx = 0; - key_ptr = ht->slots; - } else { - key_ptr = key_ptr + entry_len; - } + if (allocated && !placeholder) { + entry_ptr = ht->slots + entry_len * idx; + err = _copy_out(ht, current, entry_ptr, entry_len); + current += entry_len; + num_copied += 1; } - } - if (loc == max_uint64) { - loc = idx; + idx += 1; } - /* if replacing and it was not found, then allow it to be added as - long as there is room */ - if (*ht->header.count_ptr >= ht->header.num_slots/2) - err_return(DRAGON_HASHTABLE_FULL, "Hashtable is full."); + err = dragon_bitset_clear(&ht->allocated); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not clear the allocated set."); - key_ptr = ht->slots + entry_len * loc; - value_ptr = key_ptr + ht->header.key_len * sizeof(uint64_t); + err = dragon_bitset_clear(&ht->placeholder); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not clear the placeholder set."); - rc = _copy_in(ht, key_ptr, key, ht->header.key_len); - if (rc != DRAGON_SUCCESS) - append_err_return(rc, "There was an error on copy."); + *ht->header.num_placeholders = 0; + *ht->header.num_kvs = 0; - rc = _copy_in(ht, value_ptr, value, ht->header.value_len); - if (rc != DRAGON_SUCCESS) - append_err_return(rc, "There was an error on copy."); + key = copy; + value = key + ht->header.key_len * sizeof(uint64_t); - /* mark the location allocated */ - bit_rc = dragon_bitset_set(&ht->allocated, loc); + for (uint64_t k=0;kplaceholder, loc); - - if (bit_rc != DRAGON_SUCCESS) { - append_err_return(bit_rc, "Could not add entry into hashtable."); - } - -#ifdef HT_DEBUG - if (len > 0) { - printf("<++++++++++++++++++++++ The chain length was %lu during add. Here are keys\n", len); - _print_chain(ht, start_idx); - } -#endif - - *ht->header.count_ptr = *ht->header.count_ptr + 1; - - _check_armor(ht); + free(copy); no_err_return(DRAGON_SUCCESS); } @@ -368,13 +307,11 @@ dragon_hashtable_size(const uint64_t max_entries, const uint64_t key_len, const if (size == NULL) err_return(DRAGON_HASHTABLE_NULL_POINTER,"The size pointer was NULL."); - if (key_len%sizeof(uint64_t) != 0) { + if (key_len%sizeof(uint64_t) != 0) err_return(DRAGON_HASHTABLE_KEY_SIZE_ERROR,"The key length must be a multiple of 8 bytes."); - } - if (value_len%sizeof(uint64_t) != 0) { + if (value_len%sizeof(uint64_t) != 0) err_return(DRAGON_HASHTABLE_VALUE_SIZE_ERROR,"The value length must be a multiple of 8 bytes."); - } size_t bitset_size; @@ -386,9 +323,8 @@ dragon_hashtable_size(const uint64_t max_entries, const uint64_t key_len, const bitset_size * DRAGON_HASHTABLE_BITSET_COUNT + (key_len + value_len) * num_slots; - if (*size > thirty_two_gb) { + if (*size > thirty_two_gb) err_return(DRAGON_HASHTABLE_TOO_BIG,"The hashtable would be too big."); - } no_err_return(DRAGON_SUCCESS); } @@ -449,6 +385,11 @@ dragon_hashtable_init(char* ptr, dragonHashtable_t* ht, const uint64_t max_entri *ui_ptr = 0; blob_ptr += sizeof(uint64_t); + // Set the count of placeholders in the hashtable. + ui_ptr = (uint64_t*) blob_ptr; + *ui_ptr = 0; + blob_ptr += sizeof(uint64_t); + // Set the key_len ui_ptr = (uint64_t*) blob_ptr; *ui_ptr = key_len / sizeof(uint64_t); @@ -520,6 +461,11 @@ dragon_hashtable_destroy(dragonHashtable_t* ht) if (derr != DRAGON_SUCCESS) append_err_return(derr, "Could not destroy hashtable"); + /* Must be reset to allow using same space later. 
*/ + *ht->header.armor1 = 0; + *ht->header.armor2 = 0; + *ht->header.armor3 = 0; + no_err_return(DRAGON_SUCCESS); } @@ -555,7 +501,12 @@ dragon_hashtable_attach(char* ptr, dragonHashtable_t* ht) // Get the pointer to the count of entries in the hashtable. ui_ptr = (uint64_t*) blob_ptr; - ht->header.count_ptr = ui_ptr; + ht->header.num_kvs = ui_ptr; + blob_ptr += sizeof(uint64_t); + + // Store the number of placeholder values in the table. + ui_ptr = (uint64_t*) blob_ptr; + ht->header.num_placeholders = ui_ptr; blob_ptr += sizeof(uint64_t); // Get the key_len @@ -626,7 +577,7 @@ dragon_hashtable_detach(dragonHashtable_t* ht) // Not really necessary, but if there were an illegal access after a // destroy this will cause a segfault. - ht->header.count_ptr = NULL; + ht->header.num_kvs = NULL; ht->slots = NULL; // return success @@ -636,8 +587,10 @@ dragon_hashtable_detach(dragonHashtable_t* ht) /** @brief Add a key, value pair to the hash table. * * This will add a key, value pair to the hash table. If the hash table is full - * then a new pair cannot be added. If the key is already in the hash table then - * this add will be rejected. + * then a new pair cannot be added. The key must not already exist in the hash + * table. No check is made to prevent corruption in this case to keep adding + * entries as fast as possible. If you don't know the key is unique, use replace + * instead. * * @param ht A valid handle to a hash table. * @param key A pointer to a key of length key_len (provided when inited) @@ -647,19 +600,102 @@ dragon_hashtable_detach(dragonHashtable_t* ht) dragonError_t dragon_hashtable_add(dragonHashtable_t* ht, const char* key, const char* value) { - // add with no replace - dragonError_t derr = _hashtable_add(ht, key, value, false); + if (ht == NULL) + err_return(DRAGON_HASHTABLE_NULL_POINTER,"The dragonHashtable handle is NULL."); - if (derr != DRAGON_SUCCESS) - append_err_return(derr, "Could not add key value pair to hashtable."); + if (key == NULL) + err_return(DRAGON_HASHTABLE_NULL_POINTER,"The key is NULL."); + + if (value == NULL) + err_return(DRAGON_HASHTABLE_NULL_POINTER,"The value is NULL."); + + if (*ht->header.num_kvs >= ht->header.num_slots/2) + err_return(DRAGON_HASHTABLE_FULL, "Hashtable is full."); + + _check_armor(ht); + + /* If we have gotten to this point, cleanup by rehashing. */ + if (*ht->header.num_placeholders > ht->header.num_slots/2) + _rehash(ht); + + uint64_t idx = _hash(key, ht->header.key_len) % ht->header.num_slots; + uint64_t start_idx = idx; + + uint64_t entry_len = (ht->header.key_len + ht->header.value_len)*sizeof(uint64_t); + char* key_ptr = NULL; + char* value_ptr = NULL; + bool searching = true; + dragonError_t bit_rc; + dragonError_t rc; + unsigned char allocated = 0; + unsigned char placeholder = 0; + + while (searching) { + bit_rc = dragon_bitset_get(&ht->allocated, idx, &allocated); + if (bit_rc != DRAGON_SUCCESS) + append_err_return(bit_rc, "Could not add entry into hashtable."); + + bit_rc = dragon_bitset_get(&ht->placeholder, idx, &placeholder); + if (bit_rc != DRAGON_SUCCESS) + append_err_return(bit_rc, "Could not add entry into hashtable."); + + if (!allocated || placeholder) + searching = false; + + if (searching) { + // advance idx mod length of the slots array avoiding multiplication in the loop. + idx = (idx + 1) % ht->header.num_slots; + + // check that we have not gone all the way around. This should not happen, but + // if it did we should catch it here. 
+ if (idx == start_idx) + err_return(DRAGON_FAILURE, "There was an error in the hashtable add function."); + } + } + + key_ptr = ht->slots + entry_len * idx; + value_ptr = key_ptr + ht->header.key_len * sizeof(uint64_t); + + rc = _copy_in(ht, key_ptr, key, ht->header.key_len); + if (rc != DRAGON_SUCCESS) + append_err_return(rc, "There was an error on copy."); + + rc = _copy_in(ht, value_ptr, value, ht->header.value_len); + if (rc != DRAGON_SUCCESS) + append_err_return(rc, "There was an error on copy."); + + /* mark the location allocated */ + bit_rc = dragon_bitset_set(&ht->allocated, idx); + if (bit_rc != DRAGON_SUCCESS) + append_err_return(bit_rc, "Could not add entry into hashtable."); + + if (placeholder) { + /* if it was a placeholder, it now is not. */ + bit_rc = dragon_bitset_reset(&ht->placeholder, idx); + if (bit_rc != DRAGON_SUCCESS) + append_err_return(bit_rc, "Could not add entry into hashtable."); + + *ht->header.num_placeholders -= 1; + } + + *ht->header.num_kvs = *ht->header.num_kvs + 1; + +#ifdef HT_DEBUG + if (len > 0) { + printf("<++++++++++++++++++++++ The chain length was %lu during add. Here are keys\n", len); + _print_chain(ht, start_idx); + } +#endif + + _check_armor(ht); no_err_return(DRAGON_SUCCESS); } /** @brief Replace a key, value pair in the hash table. * - * This will replace a key, value pair in the hash table. If the key is not in the hash table, then - * the replace will fail. + * This will replace a key, value pair in the hash table if it exists and add it + * otherwise. * * @param ht A valid handle to a hash table. * @param key A pointer to a key of length key_len (provided when inited) @@ -669,8 +705,10 @@ dragon_hashtable_add(dragonHashtable_t* ht, const char* key, const char* value) dragonError_t dragon_hashtable_replace(dragonHashtable_t* ht, const char* key, const char* value) { - // add with replace - dragonError_t derr = _hashtable_add(ht, key, value, true); + // we don't care if it was actually there or not. + dragon_hashtable_remove(ht, key); + + dragonError_t derr = dragon_hashtable_add(ht, key, value); if (derr != DRAGON_SUCCESS) append_err_return(derr, "Could not add key value pair to hashtable."); @@ -698,83 +736,78 @@ dragon_hashtable_remove(dragonHashtable_t* ht, const char* key) _check_armor(ht); + /* If we have gotten to this point, cleanup by rehashing. 
*/ + if (*ht->header.num_placeholders > ht->header.num_slots/2) + _rehash(ht); + uint64_t idx = _hash(key, ht->header.key_len) % ht->header.num_slots; + uint64_t start_idx = idx; uint64_t entry_len = (ht->header.key_len + ht->header.value_len) * sizeof(uint64_t); char* key_ptr = NULL; bool searching = true; - key_ptr = ht->slots + entry_len * idx; while (searching) { unsigned char allocated; unsigned char placeholder; dragonError_t bit_rc = dragon_bitset_get(&ht->allocated, idx, &allocated); - - if (bit_rc != DRAGON_SUCCESS) { + if (bit_rc != DRAGON_SUCCESS) append_err_return(bit_rc, "Unable to remove hashtable entry."); - } if (!allocated) { searching = false; } else { bit_rc = dragon_bitset_get(&ht->placeholder, idx, &placeholder); - if (bit_rc != DRAGON_SUCCESS) { + if (bit_rc != DRAGON_SUCCESS) append_err_return(bit_rc, "unable to remove hashtable entry"); - } - if (!placeholder && _keys_equal(key,key_ptr,ht->header.key_len)) { + key_ptr = ht->slots + entry_len * idx; + if (!placeholder && _keys_equal(key, key_ptr, ht->header.key_len)) { uint64_t next_idx = (idx + 1) % ht->header.num_slots; bit_rc = dragon_bitset_get(&ht->allocated, next_idx, &allocated); if (bit_rc != DRAGON_SUCCESS) { - append_err_return(bit_rc, "Unable to remove hashtable entry."); } + append_err_return(bit_rc, "Unable to remove hashtable entry."); + } if (allocated) { // if entry to the right is allocated, then we don't // want to break the chain so make this entry a placeholder bit_rc = dragon_bitset_set(&ht->placeholder, idx); - if (bit_rc != DRAGON_SUCCESS) { + if (bit_rc != DRAGON_SUCCESS) append_err_return(bit_rc, "Unable to remove hashtable entry."); - } + + *ht->header.num_placeholders += 1; + } else { // Mark this entry as not allocated because it was removed // and the entry to the right is not allocated so it is // the end of a chain if one exists. bit_rc = dragon_bitset_reset(&ht->allocated, idx); - if (bit_rc != DRAGON_SUCCESS) { - append_err_return(bit_rc, "Unable to remove hashtable entry."); - } + if (bit_rc != DRAGON_SUCCESS) + append_err_return(bit_rc, "Unable to remove hashtable entry."); // if the entry to the right is not allocated, then // we'll make all placeholders to the left of it // not allocated too since we are at the end of a chain. 
- uint64_t prev_idx = idx; + uint64_t prev_idx = ((int64_t)idx - 1) % ht->header.num_slots; bool moving_left = true; - if (prev_idx == 0) { - prev_idx = ht->header.num_slots-1; - } else { - prev_idx -= 1; - } + while (moving_left) { bit_rc = dragon_bitset_get(&ht->placeholder, prev_idx, &placeholder); - if (bit_rc != DRAGON_SUCCESS) { + if (bit_rc != DRAGON_SUCCESS) append_err_return(bit_rc, "Unable to remove hashtable entry."); - } if (placeholder) { bit_rc = dragon_bitset_reset(&ht->placeholder, prev_idx); - if (bit_rc != DRAGON_SUCCESS) { + if (bit_rc != DRAGON_SUCCESS) append_err_return(bit_rc, "Unable to remove hashtable entry."); - } + + *ht->header.num_placeholders -= 1; bit_rc = dragon_bitset_reset(&ht->allocated, prev_idx); - if (bit_rc != DRAGON_SUCCESS) { + if (bit_rc != DRAGON_SUCCESS) append_err_return(bit_rc, "Unable to remove hashtable entry."); - } - if (prev_idx == 0) { - prev_idx = ht->header.num_slots-1; - } else { - prev_idx -= 1; - } + prev_idx = ((int64_t)prev_idx - 1) % ht->header.num_slots; } else { moving_left = false; @@ -782,7 +815,7 @@ dragon_hashtable_remove(dragonHashtable_t* ht, const char* key) } } - *(ht->header.count_ptr) -= 1; + *(ht->header.num_kvs) -= 1; #ifdef HT_DEBUG if (idx - start_idx > 0) { @@ -798,14 +831,14 @@ dragon_hashtable_remove(dragonHashtable_t* ht, const char* key) } } - // advance idx mod length of the slots array avoiding multiplication in the loop. - idx = idx + 1; - if (idx == ht->header.num_slots) { - idx = 0; - key_ptr = ht->slots; - } else { - key_ptr = key_ptr + entry_len; - } + // advance idx mod length of the slots array. + idx = (idx + 1) % ht->header.num_slots; + + // check that we have not gone all the way around. This should not happen, but + // if it did we should catch it here. + if (idx == start_idx) + err_return(DRAGON_FAILURE, "There was an error in the hashtable remove function."); + } err_return(DRAGON_HASHTABLE_KEY_NOT_FOUND, "Hashtable key not found."); @@ -837,11 +870,11 @@ dragon_hashtable_get(const dragonHashtable_t* ht, const char* key, char* value) _check_armor(ht); uint64_t idx = _hash(key, ht->header.key_len) % ht->header.num_slots; + uint64_t start_idx = idx; uint64_t entry_len = (ht->header.key_len + ht->header.value_len) * sizeof(uint64_t); char* key_ptr = NULL; char* value_ptr = NULL; bool searching = true; - key_ptr = ht->slots + entry_len * idx; dragonError_t rc; while (searching) { @@ -862,7 +895,8 @@ dragon_hashtable_get(const dragonHashtable_t* ht, const char* key, char* value) append_err_return(bit_rc, "Unable to look up key."); } - if (!placeholder && _keys_equal(key,key_ptr,ht->header.key_len)) { + key_ptr = ht->slots + entry_len * idx; + if (!placeholder && _keys_equal(key, key_ptr, ht->header.key_len)) { value_ptr = key_ptr + ht->header.key_len * sizeof(uint64_t); rc = _copy_out(ht, value, value_ptr, ht->header.value_len); if (rc != DRAGON_SUCCESS) @@ -874,13 +908,12 @@ dragon_hashtable_get(const dragonHashtable_t* ht, const char* key, char* value) } // advance idx mod length of the slots array avoiding multiplication in the loop. - idx = idx + 1; - if (idx == ht->header.num_slots) { - idx = 0; - key_ptr = ht->slots; - } else { - key_ptr = key_ptr + entry_len; - } + idx = (idx + 1) % ht->header.num_slots; + + // check that we have not gone all the way around. This should not happen, but + // if it did we should catch it here. 
+ if (idx == start_idx) + err_return(DRAGON_FAILURE, "There was an error in the hashtable get function."); } err_return(DRAGON_HASHTABLE_KEY_NOT_FOUND, "Hashtable key not found."); @@ -978,9 +1011,9 @@ dragon_hashtable_stats(const dragonHashtable_t* ht, dragonHashtableStats_t* stat if (stats == NULL) err_return(DRAGON_HASHTABLE_NULL_POINTER,"The stats structure pointer is NULL."); - stats->load_factor = *ht->header.count_ptr / ((double)ht->header.num_slots); + stats->load_factor = *ht->header.num_kvs / ((double)ht->header.num_slots); stats->capacity = ht->header.num_slots * (max_load_factor / 100.0); - stats->num_items = *ht->header.count_ptr; + stats->num_items = *ht->header.num_kvs; stats->key_len = ht->header.key_len * sizeof(uint64_t); stats->value_len = ht->header.value_len * sizeof(uint64_t); @@ -1028,10 +1061,10 @@ dragon_hashtable_stats(const dragonHashtable_t* ht, dragonHashtableStats_t* stat max_chain_length = chain_length; } } - if (*ht->header.count_ptr == 0) { + if (*ht->header.num_kvs == 0) { stats->avg_chain_length = 0.0; } else { - stats->avg_chain_length = ((double)total_chain_length) / (*ht->header.count_ptr); + stats->avg_chain_length = ((double)total_chain_length) / (*ht->header.num_kvs); } stats->max_chain_length = max_chain_length; @@ -1094,7 +1127,7 @@ dragon_hashtable_dump_to_fd(FILE* fd, const char* title, const dragonHashtable_t fprintf(fd, "%s%s\n",indent,title); fprintf(fd, "%sNumber of slots: %lu\n",indent,ht->header.num_slots); fprintf(fd, "%sCapacity: %lu\n", indent, stats.capacity); - fprintf(fd, "%sFilled slots: %lu\n", indent, *ht->header.count_ptr); + fprintf(fd, "%sFilled slots: %lu\n", indent, *ht->header.num_kvs); fprintf(fd, "%sLoad Factor: %f\n", indent, stats.load_factor); fprintf(fd, "%sKey length: %lu\n", indent, ht->header.key_len*sizeof(uint64_t)); fprintf(fd, "%sValue length: %lu\n", indent, ht->header.value_len*sizeof(uint64_t)); diff --git a/src/lib/logging.c b/src/lib/logging.c index d672832..ba2bf32 100644 --- a/src/lib/logging.c +++ b/src/lib/logging.c @@ -275,8 +275,9 @@ dragon_logging_serialize(const dragonLoggingDescr_t * logger, dragonLoggingSeria // Copy in serialized channel data *sptr = ch_ser.len; - sptr += sizeof(size_t); + sptr = (dragonULInt *) ((char *)sptr + sizeof(size_t)); memcpy(sptr, ch_ser.data, ch_ser.len); + sptr = (dragonULInt *) ((char *)sptr + ch_ser.len); // Release malloc since we have a copy now err = dragon_channel_serial_free(&ch_ser); @@ -334,10 +335,10 @@ dragon_logging_attach(const dragonLoggingSerial_t * log_ser, dragonLoggingDescr_ dragonULInt * sptr = (dragonULInt*)log_ser->data; dragonChannelSerial_t ch_ser; - ch_ser.len = *(size_t*)sptr; - sptr += sizeof(size_t); - ch_ser.data = (uint8_t*)sptr; - sptr += ch_ser.len; + ch_ser.len = *(size_t *)sptr; + sptr = (dragonULInt *) ((char *)sptr + sizeof(size_t)); + ch_ser.data = (uint8_t *)sptr; + sptr = (dragonULInt *) ((char *)sptr + ch_ser.len); dragonError_t err = dragon_channel_attach(&ch_ser, &(logger->ch)); if (err != DRAGON_SUCCESS) diff --git a/src/lib/managed_memory.c b/src/lib/managed_memory.c index 8e7895a..a67a1a4 100644 --- a/src/lib/managed_memory.c +++ b/src/lib/managed_memory.c @@ -19,6 +19,13 @@ static dragonMap_t * dg_pools = NULL; static dragonMap_t * dg_mallocs = NULL; #define _obtain_manifest_lock(pool) ({\ + if (pool == NULL) {\ + dragonError_t lerr = DRAGON_INVALID_ARGUMENT;\ + char * err_str = _errstr_with_code("manifest lock error code. 
pool is null", (int)err);\ + err_noreturn(err_str);\ + free(err_str);\ + return lerr;\ + }\ dragonError_t err = dragon_lock(&pool->mlock);\ if (err != DRAGON_SUCCESS) {\ char * err_str = _errstr_with_code("manifest lock error code", (int)err);\ @@ -29,6 +36,13 @@ static dragonMap_t * dg_mallocs = NULL; }) #define _release_manifest_lock(pool) ({\ + if (pool == NULL) {\ + dragonError_t lerr = DRAGON_INVALID_ARGUMENT;\ + char * err_str = _errstr_with_code("manifest lock error code. pool is null", (int)err);\ + err_noreturn(err_str);\ + free(err_str);\ + return lerr;\ + }\ dragonError_t err = dragon_unlock(&pool->mlock);\ if (err != DRAGON_SUCCESS) {\ char * err_str = _errstr_with_code("manifest unlock error code", (int)err);\ @@ -89,7 +103,7 @@ _pool_from_descr(const dragonMemoryPoolDescr_t * pool_descr, dragonMemoryPool_t err_return(DRAGON_INVALID_ARGUMENT, "invalid pool descriptor"); /* find the entry in our pool map for this descriptor */ - dragonError_t err = dragon_umap_getitem(dg_pools, pool_descr->_idx, (void *)pool); + dragonError_t err = dragon_umap_getitem_multikey(dg_pools, pool_descr->_rt_idx, pool_descr->_idx, (void *)pool); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to find item in pools umap"); @@ -97,18 +111,19 @@ _pool_from_descr(const dragonMemoryPoolDescr_t * pool_descr, dragonMemoryPool_t } static dragonError_t - _pool_descr_from_m_uid(const dragonM_UID_t m_uid, dragonMemoryPoolDescr_t * pool_descr) + _pool_descr_from_uids(const dragonRT_UID_t rt_uid, const dragonM_UID_t m_uid, dragonMemoryPoolDescr_t * pool_descr) { - if (pool_descr == NULL) + if (pool_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "invalid pool descriptor"); /* find the entry in our pool map for this descriptor */ dragonMemoryPool_t * pool; - dragonError_t err = dragon_umap_getitem(dg_pools, m_uid, (void *)&pool); + dragonError_t err = dragon_umap_getitem_multikey(dg_pools, rt_uid, m_uid, (void *)&pool); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to find item in pools umap"); /* update the descriptor with the m_uid key and note this cannot be original */ + pool_descr->_rt_idx = rt_uid; pool_descr->_idx = m_uid; pool_descr->_original = 0; @@ -138,7 +153,7 @@ _mem_from_descr(const dragonMemoryDescr_t * mem_descr, dragonMemory_t ** mem) static dragonError_t _add_pool_umap_entry(dragonMemoryPoolDescr_t * pool_descr, dragonMemoryPool_t * pool, - dragonM_UID_t m_uid) + dragonRT_UID_t rt_uid, dragonM_UID_t m_uid) { dragonError_t err; @@ -154,12 +169,13 @@ _add_pool_umap_entry(dragonMemoryPoolDescr_t * pool_descr, dragonMemoryPool_t * } } - err = dragon_umap_additem(dg_pools, m_uid, pool); + err = dragon_umap_additem_multikey(dg_pools, rt_uid, m_uid, pool); if (err != DRAGON_SUCCESS) { append_err_return(err, "failed to insert item into pools umap"); } /* store the m_uid as the key in the descriptor */ + pool_descr->_rt_idx = rt_uid; pool_descr->_idx = m_uid; /* Default _original to 0 */ @@ -253,9 +269,10 @@ _determine_pool_allocation_size(dragonMemoryPool_t * pool, dragonMemoryPoolAttr_ three fields, the pre_allocs the filenames, and the manifest_table. 
All fields within the header are 8 byte fields so it will have the same size as pointers to each value in the dragonMemoryPoolHeader_t structure */ + size_t lock_size = dragon_lock_size(attr->lock_type); size_t fixed_header_size = sizeof(dragonMemoryPoolHeader_t) - sizeof(void*) * 3; - attr->manifest_allocated_size = fixed_header_size + attr->npre_allocs * sizeof(size_t) + + attr->manifest_allocated_size = lock_size + fixed_header_size + attr->npre_allocs * sizeof(size_t) + (attr->n_segments + 1) * DRAGON_MEMORY_MAX_FILE_NAME_LENGTH + hashtable_size; /* for the requested allocation size, determine @@ -1092,7 +1109,6 @@ dragonError_t dragon_memory_pool_create(dragonMemoryPoolDescr_t * pool_descr, const size_t bytes, const char * base_name, const dragonM_UID_t m_uid, const dragonMemoryPoolAttr_t * attr) { - dragonError_t err; if (pool_descr == NULL) @@ -1102,6 +1118,7 @@ dragon_memory_pool_create(dragonMemoryPoolDescr_t * pool_descr, const size_t byt the special value 0 to help detect SOME failures to check the return code in user code. It is not possible to catch all failures. */ pool_descr->_idx = 0UL; + pool_descr->_rt_idx = 0UL; if (base_name == NULL || (strlen(base_name) == 0)) err_return(DRAGON_INVALID_ARGUMENT, "invalid base_name"); @@ -1147,6 +1164,9 @@ dragon_memory_pool_create(dragonMemoryPoolDescr_t * pool_descr, const size_t byt err_return(DRAGON_INTERNAL_MALLOC_FAIL, "cannot allocate new pool object"); } + /* set flag indicating that this pool is hosted by the current runtime */ + pool->runtime_is_local = true; + /* determine size of the pool based on the requested number of bytes */ uint32_t max_block_power, min_block_power, segment_max_block_power; size_t required_size; @@ -1206,8 +1226,10 @@ dragon_memory_pool_create(dragonMemoryPoolDescr_t * pool_descr, const size_t byt append_err_return(err, "cannot instantiate heap managers"); } + dragonRT_UID_t rt_uid = dragon_get_local_rt_uid(); + /* create the umap entry for the descriptor */ - err = _add_pool_umap_entry(pool_descr, pool, m_uid); + err = _add_pool_umap_entry(pool_descr, pool, rt_uid, m_uid); if (err != DRAGON_SUCCESS) { _free_pool(pool, &def_attr); free(pool); @@ -1254,14 +1276,16 @@ dragon_memory_pool_destroy(dragonMemoryPoolDescr_t * pool_descr) err = _free_pool(pool, &attrs); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to release pool resources"); + err = dragon_memory_attr_destroy(&attrs); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to destroy the attributes for this pool"); /* delete the entry from the umap */ - err = dragon_umap_delitem(dg_pools, pool_descr->_idx); + err = dragon_umap_delitem_multikey(dg_pools, pool_descr->_rt_idx, pool_descr->_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to delete item in pools umap"); + pool_descr->_rt_idx = 0UL; pool_descr->_idx = 0UL; pool_descr->_original = 0; @@ -1273,6 +1297,7 @@ dragon_memory_pool_destroy(dragonMemoryPoolDescr_t * pool_descr) free(pool->mname); free(pool); + no_err_return(DRAGON_SUCCESS); } @@ -1309,6 +1334,56 @@ dragon_memory_pool_get_hostid(dragonMemoryPoolDescr_t * pool_descr, dragonULInt no_err_return(DRAGON_SUCCESS); } +/** + * @brief Determine if a pool is hosted in the local runtime. + * + * @param pool_descr is a valid pool descriptor for the pool in question. + * @param runtime_is_local is a boolean indicating if the pool is hosted in the local runtime. + * + * @return DRAGON_SUCCESS or an error code. 
+ */ +dragonError_t +dragon_memory_pool_runtime_is_local(dragonMemoryPoolDescr_t *pool_descr, bool *runtime_is_local) +{ + dragonMemoryPool_t *pool = NULL; + + dragonError_t err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid pool descriptor"); + + *runtime_is_local = pool->runtime_is_local; + + return err; +} + +/** + * @brief Get the runtime unique id (rt_uid) for a pool. + * + * @param pool_descr is a valid pool descriptor for the pool in question. + * @param rt_uid is the unique value used to identify a runtime. The value is composed + * of two IP addresses: the internet IP address of the login node of the system hosting + * the runtime, and the intranet IP address of the head node for the runtime. + * + * @return DRAGON_SUCCESS or an error code. + */ +dragonError_t +dragon_memory_pool_get_rt_uid(dragonMemoryPoolDescr_t *pool_descr, dragonRT_UID_t *rt_uid) +{ + dragonMemoryPool_t *pool = NULL; + + dragonError_t err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid pool descriptor"); + + if (pool->runtime_is_local) { + *rt_uid = dragon_get_local_rt_uid(); + } else { + *rt_uid = pool->remote.rt_uid; + } + + no_err_return(DRAGON_SUCCESS); +} + /** * @brief Get meta information about a memory pool. * @@ -1335,7 +1410,7 @@ dragon_memory_pool_get_uid_fname(const dragonMemoryPoolSerial_t * pool_ser, drag if (filename != NULL) { dragonULInt * ptr = (dragonULInt*)pool_ser->data; - ptr += 4; // skip id, host_id, mem_type, manifest_len + ptr += 5; // skip id, host_id, runtime ip addrs, mem_type, manifest_len *filename = strdup((char*)ptr); } @@ -1386,6 +1461,7 @@ dragon_memory_pool_descr_clone(dragonMemoryPoolDescr_t * newpool_descr, const dr append_err_return(err, "invalid pool descriptor"); /* update the new one */ + newpool_descr->_rt_idx = oldpool_descr->_rt_idx; newpool_descr->_idx = oldpool_descr->_idx; newpool_descr->_original = 0; @@ -1479,6 +1555,14 @@ dragon_memory_pool_serialize(dragonMemoryPoolSerial_t * pool_ser, const dragonMe *ptr = pool->remote.hostid; ptr++; + /* Copy the runtime unique ID */ + dragonULInt rt_uid; + if (pool->runtime_is_local) + *ptr = rt_uid = dragon_get_local_rt_uid(); + else + *ptr = rt_uid = pool->remote.rt_uid; + ptr++; + /* Copy memory type (SHM, file, etc) */ if (local_pool) *ptr = *pool->header.mem_type; @@ -1499,6 +1583,7 @@ dragon_memory_pool_serialize(dragonMemoryPoolSerial_t * pool_ser, const dragonMe no_err_return(DRAGON_SUCCESS); } + /** * @brief Attach to a pool using a serialied descriptor. * @@ -1516,6 +1601,7 @@ dragonError_t dragon_memory_pool_attach(dragonMemoryPoolDescr_t * pool_descr, const dragonMemoryPoolSerial_t * pool_ser) { bool local_pool = true; + bool runtime_is_local; /* Check args are not null. 
*/ if (pool_descr == NULL) @@ -1537,8 +1623,17 @@ dragon_memory_pool_attach(dragonMemoryPoolDescr_t * pool_descr, const dragonMemo dragonM_UID_t m_uid = *ptr; ptr++; + /* Check if the local dragon_host_id and runtime ip addrs match the pool's values */ + dragonULInt local_host_id = dragon_host_id(); + dragonULInt host_id = *ptr; + ptr++; + + dragonULInt local_rt_uid = dragon_get_local_rt_uid(); + dragonULInt rt_uid = *ptr; + ptr++; + /* check if we already have attached to this pool, if so we will just use that */ - dragonError_t err = _pool_descr_from_m_uid(m_uid, pool_descr); + dragonError_t err = _pool_descr_from_uids(rt_uid, m_uid, pool_descr); if (err == DRAGON_SUCCESS) { dragonMemoryPool_t * pool; _pool_from_descr(pool_descr, &pool); @@ -1546,19 +1641,23 @@ dragon_memory_pool_attach(dragonMemoryPoolDescr_t * pool_descr, const dragonMemo no_err_return(DRAGON_SUCCESS); } - /* Validate dragon_host_id matches pool's host_id */ - dragonULInt local_host_id = dragon_host_id(); - dragonULInt host_id = *ptr; - ptr++; - /* Allocate a new pool, open the manifest file, and map it into the pool header */ dragonMemoryPool_t * pool = (dragonMemoryPool_t*)malloc(sizeof(dragonMemoryPool_t)); if (pool == NULL) err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate internal pool structure."); - /* If this is a non-local pool we are attaching to, then we set a flag. */ - if (local_host_id != host_id) + if (local_rt_uid != rt_uid) + runtime_is_local = false; + else + runtime_is_local = true; + + /* If this is a non-local pool we are attaching to, then we set a flag and return. */ + if (local_host_id != host_id || !runtime_is_local) local_pool = false; + else + local_pool = true; + + pool->runtime_is_local = runtime_is_local; /* Grab the memory storage type */ dragonULInt mem_type = *ptr; @@ -1640,13 +1739,14 @@ dragon_memory_pool_attach(dragonMemoryPoolDescr_t * pool_descr, const dragonMemo } else { pool->local_dptr = NULL; pool->remote.hostid = host_id; + pool->remote.rt_uid = rt_uid; pool->remote.m_uid = m_uid; pool->remote.mem_type = mem_type; pool->remote.manifest_len = manifest_len; } /* Add entry into pool umap updating pool descriptor's idx */ - err = _add_pool_umap_entry(pool_descr, pool, m_uid); + err = _add_pool_umap_entry(pool_descr, pool, rt_uid, m_uid); if (err != DRAGON_SUCCESS) { free(pool); @@ -1691,7 +1791,7 @@ dragon_memory_pool_attach_from_env(dragonMemoryPoolDescr_t * pool_descr, const c err_return(DRAGON_INVALID_ARGUMENT, err_str); } - pool_ser.data = dragon_base64_decode(encoded_pool_str, strlen(encoded_pool_str), &pool_ser.len); + pool_ser.data = dragon_base64_decode(encoded_pool_str, &pool_ser.len); dragonError_t err = dragon_memory_pool_attach(pool_descr, &pool_ser); if (err != DRAGON_SUCCESS) @@ -1718,14 +1818,8 @@ dragonError_t dragon_memory_pool_attach_default(dragonMemoryPoolDescr_t* pool) { dragonError_t err; - char* pool_str; - - pool_str = getenv(DRAGON_DEFAULT_PD_VAR); - if (pool_str == NULL) - err_return(DRAGON_INVALID_OPERATION, "Called dragon_get_default_pool with no default pool set in environment."); - - err = dragon_memory_pool_attach_from_env(pool, pool_str); + err = dragon_memory_pool_attach_from_env(pool, DRAGON_DEFAULT_PD_VAR); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not attach to default memory pool."); @@ -1792,10 +1886,11 @@ dragon_memory_pool_detach(dragonMemoryPoolDescr_t * pool_descr) } /* Remove from umap */ - err = dragon_umap_delitem(dg_pools, pool_descr->_idx); + err = dragon_umap_delitem_multikey(dg_pools, pool_descr->_rt_idx, 
pool_descr->_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to delete item in pools umap"); + pool_descr->_rt_idx = 0UL; pool_descr->_idx = 0UL; pool_descr->_original = 0; @@ -1832,6 +1927,159 @@ dragon_memory_pool_serial_free(dragonMemoryPoolSerial_t * pool_ser) no_err_return(DRAGON_SUCCESS); } +/** + * @brief Get the muid from a pool. + * + * Especially when attaching a pool, the muid isn't always known + * apriori. This call can be used to discover the muid of any + * pool. + * + * @param pool is a pool descriptor + * + * @param muid is a pointer to space to receive the muid + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ +dragonError_t +dragon_memory_pool_muid(dragonMemoryPoolDescr_t* pool_descr, dragonULInt* muid) +{ + if (pool_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "pool descriptor is NULL"); + + if (muid == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "muid is NULL"); + + /* Get the pool from descriptor */ + dragonMemoryPool_t * pool; + dragonError_t err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid pool descriptor"); + + if (pool->local_dptr != NULL) + /* It is local */ + *muid = *pool->header.m_uid; + else /* non-local */ + *muid = pool->remote.m_uid; + + no_err_return(DRAGON_SUCCESS); +} + +/** + * @brief Get the free space in the pool. + * + * Return the amount of free space. + * + * @param pool is a pool descriptor + * + * @param free_size is a pointer to space to receive the free size in bytes. + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ +dragonError_t +dragon_memory_pool_get_free_size(dragonMemoryPoolDescr_t* pool_descr, uint64_t* free_size) { + + dragonMemoryPool_t * pool; + dragonError_t err; + dragonHeapStats_t stats; + + if (pool_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "pool descriptor is NULL"); + + if (free_size == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "free_size is NULL"); + + /* Get the pool from descriptor */ + err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid pool descriptor"); + + err = dragon_heap_get_stats(&pool->heap.mgrs[0], &stats); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get pool stats."); + + *free_size = stats.total_free_space; + + no_err_return(DRAGON_SUCCESS); +} + +/** + * @brief Get the total space for the pool. + * + * Return the amount of space for the whole pool, whether + * currently allocated or not. + * + * @param pool is a pool descriptor + * + * @param free_size is a pointer to space to receive the total size in bytes. + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ +dragonError_t +dragon_memory_pool_get_total_size(dragonMemoryPoolDescr_t* pool_descr, uint64_t* total_size) { + + dragonMemoryPool_t * pool; + dragonError_t err; + dragonHeapStats_t stats; + + if (pool_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "pool descriptor is NULL"); + + if (total_size == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "total_size is NULL"); + + /* Get the pool from descriptor */ + err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid pool descriptor"); + + err = dragon_heap_get_stats(&pool->heap.mgrs[0], &stats); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get pool stats."); + + *total_size = stats.total_size; + + no_err_return(DRAGON_SUCCESS); +} + +/** + * @brief Get the free utilization percent in the pool. 
+ * + * Return the percentage of free space. + * + * @param pool is a pool descriptor + * + * @param free_size is a pointer to space to receive the free size as a double value. + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ + +dragonError_t +dragon_memory_pool_get_utilization_pct(dragonMemoryPoolDescr_t* pool_descr, double* utilization_pct) { + + dragonMemoryPool_t * pool; + dragonError_t err; + dragonHeapStats_t stats; + + if (pool_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "pool descriptor is NULL"); + + if (utilization_pct == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "free_pct is NULL"); + + /* Get the pool from descriptor */ + err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid pool descriptor"); + + err = dragon_heap_get_stats(&pool->heap.mgrs[0], &stats); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get pool stats."); + + *utilization_pct = stats.utilization_pct; + + no_err_return(DRAGON_SUCCESS); +} + /** * @brief Get the maximum length of a serialized memory descriptor * @@ -2011,6 +2259,7 @@ dragon_memory_attach(dragonMemoryDescr_t * mem_descr, const dragonMemorySerial_t /* Get our serialized pool */ dragonMemoryPoolDescr_t pool_descr; + dragonError_t err = dragon_memory_pool_attach(&pool_descr, &pool_ser); if (err != DRAGON_SUCCESS) append_err_return(err, "could not attach to memory pool"); @@ -2103,11 +2352,12 @@ dragon_memory_detach(dragonMemoryDescr_t * mem_descr) dragonError_t dragon_memory_alloc_blocking(dragonMemoryDescr_t * mem_descr, const dragonMemoryPoolDescr_t * pool_descr, const size_t bytes, const timespec_t* timeout) { - size_t alloc_bytes = bytes; - if (mem_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "invalid memory descriptor"); + if (bytes == 0) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot allocate zero bytes."); + /* The _idx should never be zero. It is set below if successully initialized. We'll use the special value 0 to help detect SOME failures to check the return code in user code. It is not possible to catch all failures. */ @@ -2134,15 +2384,7 @@ dragon_memory_alloc_blocking(dragonMemoryDescr_t * mem_descr, const dragonMemory err_return(DRAGON_INTERNAL_MALLOC_FAIL, "cannot allocate new memory object"); } - // A zero byte allocation is needed in channels when attributes are to be sent - // and potentially other entities that require a shared memory descriptor when - // there is no real allocation to make. So don't reject bytes == 0. - if (bytes == 0) - // To avoid special case code for this everywhere (cloning, freeing, etc.) - // we will make a 1 byte allocation, but say that it is zero bytes. - alloc_bytes = 1; - - err = dragon_heap_malloc_blocking(&pool->heap.mgrs[0], alloc_bytes, &mem->local_dptr, timeout); + err = dragon_heap_malloc_blocking(&pool->heap.mgrs[0], bytes, &mem->local_dptr, timeout); if (err != DRAGON_SUCCESS) /* Don't use append_err_return. 
In hot path */ return err; @@ -2174,6 +2416,7 @@ dragon_memory_alloc_blocking(dragonMemoryDescr_t * mem_descr, const dragonMemory /* store the id from the pool descriptor for this allocation so we can later reverse map back to the pool to get the heap managers for memory frees */ + mem->pool_descr._rt_idx = pool_descr->_rt_idx; mem->pool_descr._idx = pool_descr->_idx; mem->pool_descr._original = 1; @@ -2910,6 +3153,7 @@ dragon_memory_get_alloc_memdescr(dragonMemoryDescr_t * mem_descr, const dragonMe _obtain_manifest_lock(pool); err = _lookup_allocation(pool, type, type_id, mem); _release_manifest_lock(pool); + if (err != DRAGON_SUCCESS) { char err_str[100]; free(mem); @@ -3050,3 +3294,126 @@ dragon_memory_modify_size(dragonMemoryDescr_t * mem_descr, const size_t new_size no_err_return(DRAGON_SUCCESS); } + +/** + * @brief Compute a hash value for a memory allocation + * + * Use the Dragon hash function to compute a hash value + * + * @param mem_descr is the memory descriptor to hash. + * + * @param hash_value is the hash function's value + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ + +dragonError_t +dragon_memory_hash(dragonMemoryDescr_t* mem_descr, dragonULInt* hash_value) +{ + /* Check that the given descriptor points to valid memory */ + dragonMemory_t* mem; + dragonError_t err; + + if (hash_value == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must pass a pointer to a hash_value location to store the result."); + + err = _mem_from_descr(mem_descr, &mem); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid memory descriptor"); + + if (mem->local_dptr == NULL) + err_return(DRAGON_MEMORY_OPERATION_ATTEMPT_ON_NONLOCAL_POOL, "You cannot hash a non-local memory allocation."); + + *hash_value = dragon_hash(mem->local_dptr+mem->offset, mem->bytes); + + no_err_return(DRAGON_SUCCESS); + +} + +/** + * @brief Check for equal contents + * + * Check that two memory allocations have equal contents. + * + * @param mem_descr1 One memory allocation. + * @param mem_descr2 Other memory allocation. + * + * @param result is true if equal and false otherwise. + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ + +dragonError_t +dragon_memory_equal(dragonMemoryDescr_t* mem_descr1, dragonMemoryDescr_t* mem_descr2, bool* result) +{ + /* Check that the given descriptor points to valid memory */ + dragonMemory_t* mem1; + dragonMemory_t* mem2; + dragonError_t err; + + if (result == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must pass a pointer to a result location to store the result."); + + *result = false; + + err = _mem_from_descr(mem_descr1, &mem1); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid memory descriptor"); + + if (mem1->local_dptr == NULL) + err_return(DRAGON_MEMORY_OPERATION_ATTEMPT_ON_NONLOCAL_POOL, "You cannot hash a non-local memory allocation."); + + err = _mem_from_descr(mem_descr2, &mem2); + if (err != DRAGON_SUCCESS) + append_err_return(err, "invalid memory descriptor"); + + if (mem2->local_dptr == NULL) + err_return(DRAGON_MEMORY_OPERATION_ATTEMPT_ON_NONLOCAL_POOL, "You cannot hash a non-local memory allocation."); + + *result = dragon_bytes_equal(mem1->local_dptr + mem1->offset, mem2->local_dptr + mem2->offset, mem1->bytes, mem2->bytes); + + no_err_return(DRAGON_SUCCESS); +} + +/** + * @brief Check for equal contents + * + * Check that two memory allocations have equal contents. + * + * @param mem_descr1 One memory allocation. + * @param mem_descr2 Other memory allocation. 
+ * + * @param result is true if equal and false otherwise. + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ + +dragonError_t +dragon_memory_copy(dragonMemoryDescr_t* from_mem, dragonMemoryDescr_t* to_mem, dragonMemoryPoolDescr_t* to_pool, const timespec_t* timeout) +{ + dragonError_t err; + size_t size; + void* from_ptr; + void* to_ptr; + + err = dragon_memory_get_size(from_mem, &size); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get size of memory."); + + err = dragon_memory_alloc_blocking(to_mem, to_pool, size, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not allocate new memory."); + + err = dragon_memory_get_pointer(from_mem, &from_ptr); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get from memory pointer."); + + err = dragon_memory_get_pointer(to_mem, &to_ptr); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get to memory pointer."); + + memcpy(to_ptr, from_ptr, size); + + no_err_return(DRAGON_SUCCESS); +} + diff --git a/src/lib/message_defs.capnp b/src/lib/message_defs.capnp new file mode 100644 index 0000000..98ac763 --- /dev/null +++ b/src/lib/message_defs.capnp @@ -0,0 +1,191 @@ +@0x89c9f71b7b1aa97e; # unique file ID, generated by `capnp id` + +struct SHCreateProcessLocalChannelDef { + puid @0: UInt64; + respFLI @1: Text; +} + +struct SHCreateProcessLocalChannelResponseDef { + serChannel @0: Text; +} + +struct SHSetKVDef { + key @0: Text; + value @1: Text; + respFLI @2: Text; +} + +struct SHGetKVDef { + key @0: Text; + respFLI @1: Text; +} + +struct SHGetKVLResponseDef { + values @0: List(Text); +} + +struct SHGetKVResponseDef { + value @0: Text; +} + +struct DDCreateDef { + respFLI @0: Text; + args @1: Text; +} + +struct DDRegisterManagerDef { + mainFLI @0: Text; + respFLI @1: Text; +} + +struct DDRegisterManagerResponseDef { + managerID @0: UInt64; + managers @1: List(Text); +} + +struct DDGetRandomManagerDef { + respFLI @0: Text; +} + +struct DDGetRandomManagerResponseDef { + manager @0: Text; +} + +struct DDRegisterClientDef { + respFLI @0: Text; + bufferedRespFLI @1: Text; +} + +struct DDRegisterClientResponseDef { + clientID @0: UInt64; + numManagers @1: UInt64; +} + +struct DDConnectToManagerDef { + clientID @0: UInt64; + managerID @1: UInt64; +} + +struct DDConnectToManagerResponseDef { + manager @0: Text; +} + +struct DDRegisterClientIDDef { + clientID @0: UInt64; + respFLI @1: Text; + bufferedRespFLI @2: Text; +} + +struct DDDestroyDef { + clientID @0: UInt64; + respFLI @1: Text; +} + +struct DDDestroyManagerDef { + respFLI @0: Text; +} + +struct DDPutDef { + clientID @0: UInt64; +} + +struct DDGetDef { + clientID @0: UInt64; +} + +struct DDPopDef { + clientID @0: UInt64; +} + +struct DDContainsDef { + clientID @0: UInt64; +} + +struct DDGetLengthDef { + clientID @0: UInt64; +} + +struct DDGetLengthResponseDef { + length @0: UInt64; +} + +struct DDClearDef { + clientID @0: UInt64; +} + +struct DDGetIteratorDef { + clientID @0: UInt64; +} + +struct DDGetIteratorResponseDef { + iterID @0: UInt64; +} + +struct DDIteratorNextDef { + clientID @0: UInt64; + iterID @1: UInt64; +} + +struct DDKeysDef { + clientID @0: UInt64; +} + +struct DDDeregisterClientDef { + clientID @0: UInt64; + respFLI @1: Text; +} + +struct NoMessageSpecificData { + none @0: Void; +} + +struct ResponseDef { + ref @0: UInt64; + err @1: UInt64; + errInfo @2: Text; +} + +struct MessageDef { + tc @0: UInt64; + tag @1: UInt64; + responseOption: union { + none @2: Void; + value @3: ResponseDef; + } + 
union { + none @4: NoMessageSpecificData; + shCreateProcessLocalChannel @5: SHCreateProcessLocalChannelDef; + shCreateProcessLocalChannelResponse @6: SHCreateProcessLocalChannelResponseDef; + shPushKVL @7: SHSetKVDef; + shPopKVL @8: SHSetKVDef; + shGetKVL @9: SHGetKVDef; + shGetKVLResponse @10: SHGetKVLResponseDef; + shSetKV @11: SHSetKVDef; + shGetKV @12: SHGetKVDef; + shGetKVResponse @13: SHGetKVResponseDef; + ddRegisterClient @14: DDRegisterClientDef; + ddRegisterClientResponse @15: DDRegisterClientResponseDef; + ddDestroy @16: DDDestroyDef; + ddDestroyManager @17: DDDestroyManagerDef; + ddRegisterManager @18: DDRegisterManagerDef; + ddRegisterClientID @19: DDRegisterClientIDDef; + ddPut @20: DDPutDef; + ddGet @21: DDGetDef; + ddPop @22: DDPopDef; + ddContains @23: DDContainsDef; + ddGetLength @24: DDGetLengthDef; + ddGetLengthResponse @25: DDGetLengthResponseDef; + ddClear @26: DDClearDef; + ddGetIterator @27: DDGetIteratorDef; + ddGetIteratorResponse @28: DDGetIteratorResponseDef; + ddIteratorNext @29: DDIteratorNextDef; + ddKeys @30: DDKeysDef; + ddDeregisterClient @31: DDDeregisterClientDef; + ddCreate @32: DDCreateDef; + ddRegisterManagerResponse @33: DDRegisterManagerResponseDef; + ddConnectToManager @34: DDConnectToManagerDef; + ddConnectToManagerResponse @35: DDConnectToManagerResponseDef; + ddGetRandomManager @36: DDGetRandomManagerDef; + ddGetRandomManagerResponse @37: DDGetRandomManagerResponseDef; + } +} \ No newline at end of file diff --git a/src/lib/message_tcs_to_enum.py b/src/lib/message_tcs_to_enum.py new file mode 100644 index 0000000..432fbd1 --- /dev/null +++ b/src/lib/message_tcs_to_enum.py @@ -0,0 +1,62 @@ + +def main(): + file = open('../dragon/infrastructure/messages.py', 'r') + in_enum = False + enum_file = open('../include/dragon/message_tcs.hpp', "w") + map_file = open('_message_tcs.hpp', "w") + enum_file.write('#ifndef message_tcs_hpp\n') + enum_file.write('#define message_tcs_hpp\n') + enum_file.write('\n') + enum_file.write('/* DO NOT EDIT THIS FILE. */\n') + enum_file.write('/* This file was automatically generated by the messages_tc_to_enum.py */\n') + enum_file.write('/* program. It is a C/C++ duplicate of the typecodes located in */\n') + enum_file.write('/* src/dragon/infrastructure/messages.py */\n') + map_file.write('/* DO NOT EDIT THIS FILE. */\n') + map_file.write('/* This file was automatically generated by the messages_tc_to_enum.py */\n') + map_file.write('/* program. This provides a mapping of typecode to typecode name. It is */\n') + map_file.write('/* meant to be only included in messages.cpp. 
Lookups on the map can be */\n') + map_file.write('/* can be done by including messages.hpp and calling dragon_msg_tc_name.*/\n') + map_file.write('#include \n') + + map_file.write('\n') + map_file.write('static unordered_map tcMap\n') + map_file.write('{\n') + enum_file.write('\n') + enum_file.write('enum MessageType {\n') + line_to_write = '' + map_line = '' + next_val = 1 + + for line in file: + if line.startswith('class MessageType'): + in_enum = True + elif line.startswith('class'): + break + if in_enum: + lst = line.split() + if len(lst) >= 3 and lst[1] == '=': + if len(line_to_write) > 0: + enum_file.write(f'{line_to_write},\n') + map_file.write(f'{map_line},\n') + line_to_write = '' + map_line = '' + tc = lst[0] + val = lst[2] + if val=='enum.auto()': + val = next_val + next_val+=1 + line_to_write = f' {tc} = {val}' + map_line = ' {' + f'{tc}, "{tc}"' +'}' + + if len(line_to_write) > 0: + enum_file.write(f'{line_to_write}\n') + map_file.write(f'{map_line}\n') + + enum_file.write('};\n') + enum_file.write('\n') + enum_file.write('#endif\n') + map_file.write('};\n') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/lib/messages.cpp b/src/lib/messages.cpp new file mode 100644 index 0000000..f3b0e81 --- /dev/null +++ b/src/lib/messages.cpp @@ -0,0 +1,1713 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "err.h" +#include +#include "_message_tcs.hpp" +#include +#include "shared_lock.h" + +using namespace std; + +static uint64_t sh_tag = 0; + +uint64_t inc_sh_tag() { + uint64_t tmp = sh_tag; + sh_tag+=1; + return tmp; +} + +/********************************************************************************************************/ +/* base message */ + +DragonMsg::DragonMsg(MessageType tc, uint64_t tag) +{ + this->mTC = tc; + this->mTag = tag; +} + +DragonMsg::~DragonMsg() {} + +void +DragonMsg::builder(MessageDef::Builder& msg) +{ + msg.setTc(mTC); + msg.setTag(mTag); +} + +dragonError_t +DragonMsg::send(dragonFLISendHandleDescr_t* sendh, const timespec_t* timeout) +{ + dragonError_t err; + int fd; + + try { + capnp::MallocMessageBuilder message; + MessageDef::Builder msg = message.initRoot(); + this->builder(msg); + + err = dragon_fli_create_writable_fd(sendh, &fd, true, 0, 0, timeout); + if (err != DRAGON_SUCCESS) + err_return(err, "Could not create writable fd to send the message."); + + capnp::writePackedMessageToFd(fd, message); + + close(fd); + + err = dragon_fli_finalize_writable_fd(sendh); + if (err != DRAGON_SUCCESS) + err_return(err, "Could not finalize the fd after sending message."); + + } catch (...) 
{ + err_return(DRAGON_INVALID_OPERATION, "There was an error while attempting to send the message over the fli."); + } + + no_err_return(DRAGON_SUCCESS); +} + +MessageType +DragonMsg::tc() +{ + return mTC; +} + +uint64_t +DragonMsg::tag() +{ + return mTag; +} + +/********************************************************************************************************/ +/* base response message */ + +DragonResponseMsg::DragonResponseMsg(MessageType tc, uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo) : + DragonMsg(tc, tag), mRef(ref), mErr(err), mErrInfo(errInfo) {} + +DragonResponseMsg::~DragonResponseMsg() {} + +uint64_t DragonResponseMsg::ref() +{ + return mRef; +} + +dragonError_t DragonResponseMsg::err() +{ + return mErr; +} + +const char* DragonResponseMsg::errInfo() +{ + return mErrInfo.c_str(); +} + +void +DragonResponseMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + ResponseDef::Builder builder = msg.getResponseOption().initValue(); + builder.setRef(mRef); + builder.setErr(mErr); + builder.setErrInfo(mErrInfo); +} + +/********************************************************************************************************/ +/* local services create process local channel */ + +SHCreateProcessLocalChannel::SHCreateProcessLocalChannel(uint64_t tag, uint64_t puid, const char* respFLI): + DragonMsg(SHCreateProcessLocalChannel::TC, tag), mPUID(puid), mFLI(respFLI) {} + +void +SHCreateProcessLocalChannel::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + SHCreateProcessLocalChannelDef::Builder builder = msg.initShCreateProcessLocalChannel(); + builder.setPuid(this->mPUID); + builder.setRespFLI(this->mFLI); +} + +dragonError_t +SHCreateProcessLocalChannel::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + + SHCreateProcessLocalChannelDef::Reader mReader = reader.getShCreateProcessLocalChannel(); + + (*msg) = new SHCreateProcessLocalChannel( + reader.getTag(), + mReader.getPuid(), + mReader.getRespFLI().cStr()); + + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the SHCreateProcessLocalChannel message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* +SHCreateProcessLocalChannel::respFLI() +{ + return mFLI.c_str(); +} + +const uint64_t +SHCreateProcessLocalChannel::puid() +{ + return mPUID; +} + +/********************************************************************************************************/ +/* local services create process local channel response */ + +SHCreateProcessLocalChannelResponse::SHCreateProcessLocalChannelResponse(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, const char* serChannel): + DragonResponseMsg(SHCreateProcessLocalChannelResponse::TC, tag, ref, err, errInfo), mSerChannel(serChannel) {} + +void +SHCreateProcessLocalChannelResponse::builder(MessageDef::Builder& msg) +{ + DragonResponseMsg::builder(msg); + SHCreateProcessLocalChannelResponseDef::Builder builder = msg.initShCreateProcessLocalChannelResponse(); + builder.setSerChannel(mSerChannel); +} + +dragonError_t +SHCreateProcessLocalChannelResponse::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + SHCreateProcessLocalChannelResponseDef::Reader mReader = reader.getShCreateProcessLocalChannelResponse(); + + (*msg) = new SHCreateProcessLocalChannelResponse( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr(), + mReader.getSerChannel().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the SHCreateProcessLocalChannelResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* +SHCreateProcessLocalChannelResponse::serChannel() +{ + return this->mSerChannel.c_str(); +} + +/********************************************************************************************************/ +/* local services Set Key/Value Pair */ + +SHSetKVMsg::SHSetKVMsg(uint64_t tag, const char* key, const char* value, const char* respFLI): + DragonMsg(SHSetKVMsg::TC, tag), mKey(key), mValue(value), mFLI(respFLI) {} + +void SHSetKVMsg::builder(MessageDef::Builder& msg) { + DragonMsg::builder(msg); + SHSetKVDef::Builder builder = msg.initShSetKV(); + builder.setKey(this->mKey); + builder.setValue(this->mValue); + builder.setRespFLI(this->mFLI); +} + +dragonError_t SHSetKVMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) { + try { + SHSetKVDef::Reader mReader = reader.getShSetKV(); + + (*msg) = new SHSetKVMsg( + reader.getTag(), + mReader.getKey().cStr(), + mReader.getValue().cStr(), + mReader.getRespFLI().cStr()); + + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the SHSetKV message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* SHSetKVMsg::key() { + return mKey.c_str(); +} + +const char* SHSetKVMsg::value() { + return mValue.c_str(); +} + +const char* SHSetKVMsg::respFLI() { + return mFLI.c_str(); +} + +/********************************************************************************************************/ +/* local services Set Key/Value Pair Response */ + +SHSetKVResponseMsg::SHSetKVResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(SHSetKVResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +SHSetKVResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new SHSetKVResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the SHSetKVResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* local services Get Key/Value Pair */ + +SHGetKVMsg::SHGetKVMsg(uint64_t tag, const char* key, const char* respFLI): + DragonMsg(SHGetKVMsg::TC, tag), mKey(key), mFLI(respFLI) {} + +void SHGetKVMsg::builder(MessageDef::Builder& msg) { + DragonMsg::builder(msg); + SHGetKVDef::Builder builder = msg.initShGetKV(); + builder.setKey(this->mKey); + builder.setRespFLI(this->mFLI); +} + +dragonError_t SHGetKVMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) { + try { + SHGetKVDef::Reader mReader = reader.getShGetKV(); + + (*msg) = new SHGetKVMsg( + reader.getTag(), + mReader.getKey().cStr(), + mReader.getRespFLI().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the SHGetKV message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* SHGetKVMsg::key() { + return mKey.c_str(); +} + +const char* SHGetKVMsg::respFLI() { + return mFLI.c_str(); +} + +/********************************************************************************************************/ +/* local services Get Key/Value Pair Response */ + +SHGetKVResponseMsg::SHGetKVResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, const char* value): + DragonResponseMsg(SHGetKVResponseMsg::TC, tag, ref, err, errInfo), mValue(value) {} + +void +SHGetKVResponseMsg::builder(MessageDef::Builder& msg) +{ + DragonResponseMsg::builder(msg); + SHGetKVResponseDef::Builder builder = msg.initShGetKVResponse(); + builder.setValue(mValue); +} + +dragonError_t +SHGetKVResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + SHGetKVResponseDef::Reader mReader = reader.getShGetKVResponse(); + (*msg) = new SHGetKVResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr(), + mReader.getValue().cStr()); + + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the SHGetKVResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* SHGetKVResponseMsg::value() { + return mValue.c_str(); +} + + +/********************************************************************************************************/ +/* ddict register client */ +DDRegisterClientMsg::DDRegisterClientMsg(uint64_t tag, const char* respFLI, const char* bufferedRespFLI) : + DragonMsg(DDRegisterClientMsg::TC, tag), mFLI(respFLI), bFLI(bufferedRespFLI) {} + +void +DDRegisterClientMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDRegisterClientDef::Builder builder = msg.initDdRegisterClient(); + builder.setRespFLI(this->mFLI); + builder.setBufferedRespFLI(this->bFLI); +} + +dragonError_t +DDRegisterClientMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDRegisterClientDef::Reader mReader = reader.getDdRegisterClient(); + + (*msg) = new DDRegisterClientMsg( + reader.getTag(), + mReader.getRespFLI().cStr(), + mReader.getBufferedRespFLI().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the RegisterClient message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* +DDRegisterClientMsg::respFLI() +{ + return mFLI.c_str(); +} + +const char* +DDRegisterClientMsg::bufferedRespFLI() +{ + return bFLI.c_str(); +} + +/********************************************************************************************************/ +/* ddict register client response */ + +DDRegisterClientResponseMsg::DDRegisterClientResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, uint64_t clientID, uint64_t numManagers) : + DragonResponseMsg(DDRegisterClientResponseMsg::TC, tag, ref, err, errInfo), + mClientID(clientID), + mNumManagers(numManagers) {} + +dragonError_t +DDRegisterClientResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + DDRegisterClientResponseDef::Reader mReader = reader.getDdRegisterClientResponse(); + + (*msg) = new DDRegisterClientResponseMsg( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr(), + mReader.getClientID(), + mReader.getNumManagers()); + + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the SHCreateProcessLocalChannelResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDRegisterClientResponseMsg::clientID() +{ + return mClientID; +} + +uint64_t +DDRegisterClientResponseMsg::numManagers() { + return mNumManagers; +} + +void +DDRegisterClientResponseMsg::builder(MessageDef::Builder& msg) +{ + DragonResponseMsg::builder(msg); + DDRegisterClientResponseDef::Builder builder = msg.initDdRegisterClientResponse(); + builder.setClientID(mClientID); + builder.setNumManagers(mNumManagers); +} + + +/********************************************************************************************************/ + + +DDDestroyMsg::DDDestroyMsg(uint64_t tag, const char* respFLI) : + DragonMsg(DDDestroyMsg::TC, tag), mFLI(respFLI) {} + +void +DDDestroyMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDDestroyDef::Builder builder = msg.initDdDestroy(); + builder.setRespFLI(this->mFLI); +} + +dragonError_t +DDDestroyMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + uint64_t tag = reader.getTag(); + + DDDestroyDef::Reader mReader = reader.getDdDestroy(); + + (*msg) = new DDDestroyMsg(tag, mReader.getRespFLI().cStr()); + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the Destroy message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* +DDDestroyMsg::respFLI() +{ + return mFLI.c_str(); +} + +/********************************************************************************************************/ +DDDestroyResponseMsg::DDDestroyResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDDestroyResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDDestroyResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDDestroyResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDDestroyResponseMsg message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict destroy manager message */ + +DDDestroyManagerMsg::DDDestroyManagerMsg(uint64_t tag, const char* respFLI) : + DragonMsg(DDDestroyManagerMsg::TC, tag), mFLI(respFLI) {} + +void +DDDestroyManagerMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDDestroyManagerDef::Builder builder = msg.initDdDestroyManager(); + builder.setRespFLI(this->mFLI); +} + +dragonError_t +DDDestroyManagerMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + uint64_t tag = reader.getTag(); + + DDDestroyManagerDef::Reader mReader = reader.getDdDestroyManager(); + + (*msg) = new DDDestroyManagerMsg(tag, mReader.getRespFLI().cStr()); + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the Destroy message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* +DDDestroyManagerMsg::respFLI() +{ + return mFLI.c_str(); +} + +/********************************************************************************************************/ +/* ddict destroy manager response message */ + +DDDestroyManagerResponseMsg::DDDestroyManagerResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDDestroyManagerResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDDestroyManagerResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDDestroyManagerResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDDestroyManagerResponseMsg message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict register manager message */ + +DDRegisterManagerMsg::DDRegisterManagerMsg(uint64_t tag, const char* mainFLI, const char* respFLI) : + DragonMsg(DDRegisterManagerMsg::TC, tag), mMainFLI(mainFLI), mRespFLI(respFLI) {} + +void +DDRegisterManagerMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDRegisterManagerDef::Builder builder = msg.initDdRegisterManager(); + builder.setMainFLI(mMainFLI); + builder.setRespFLI(mRespFLI); +} + +dragonError_t +DDRegisterManagerMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + uint64_t tag = reader.getTag(); + + DDRegisterManagerDef::Reader mReader = reader.getDdRegisterManager(); + + (*msg) = new DDRegisterManagerMsg(tag, mReader.getMainFLI().cStr(), mReader.getRespFLI().cStr()); + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDRegisterManager message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* +DDRegisterManagerMsg::mainFLI() +{ + return mMainFLI.c_str(); +} + +const char* +DDRegisterManagerMsg::respFLI() +{ + return mRespFLI.c_str(); +} + +/********************************************************************************************************/ +/* ddict register manager response message */ + +DDRegisterManagerResponseMsg::DDRegisterManagerResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDRegisterManagerResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDRegisterManagerResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDRegisterManagerResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDRegisterManagerResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict register client id message */ + +DDRegisterClientIDMsg::DDRegisterClientIDMsg(uint64_t tag, uint64_t clientID, const char* respFLI, const char* bufferedRespFLI) : + DragonMsg(DDRegisterClientIDMsg::TC, tag), mClientID(clientID), mRespFLI(respFLI), mBufferedRespFLI(bufferedRespFLI) {} + +void +DDRegisterClientIDMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDRegisterClientIDDef::Builder builder = msg.initDdRegisterClientID(); + builder.setClientID(mClientID); + builder.setRespFLI(mRespFLI); + builder.setBufferedRespFLI(mBufferedRespFLI); +} + +dragonError_t +DDRegisterClientIDMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + uint64_t tag = reader.getTag(); + + DDRegisterClientIDDef::Reader mReader = reader.getDdRegisterClientID(); + + (*msg) = new DDRegisterClientIDMsg(tag, mReader.getClientID(), mReader.getRespFLI().cStr(), mReader.getBufferedRespFLI().cStr()); + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDRegisterClientID message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDRegisterClientIDMsg::clientID() { + return mClientID; +} + +const char* +DDRegisterClientIDMsg::respFLI() +{ + return mRespFLI.c_str(); +} + +const char* +DDRegisterClientIDMsg::bufferedRespFLI() +{ + return mBufferedRespFLI.c_str(); +} + +/********************************************************************************************************/ +/* ddict register client id response message */ + +DDRegisterClientIDResponseMsg::DDRegisterClientIDResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDRegisterClientIDResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDRegisterClientIDResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDRegisterClientIDResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDRegisterManagerResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict put message */ + +DDPutMsg::DDPutMsg(uint64_t tag, uint64_t clientID) : + DragonMsg(DDPutMsg::TC, tag), mClientID(clientID) {} + +void +DDPutMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDPutDef::Builder builder = msg.initDdPut(); + builder.setClientID(mClientID); +} + +dragonError_t +DDPutMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDPutDef::Reader mReader = reader.getDdPut(); + + (*msg) = new DDPutMsg(reader.getTag(), mReader.getClientID()); + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDPut message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDPutMsg::clientID() +{ + return mClientID; +} + +/********************************************************************************************************/ +/* ddict put response message */ + +DDPutResponseMsg::DDPutResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDPutResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDPutResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDPutResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDPutResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict get message */ + +DDGetMsg::DDGetMsg(uint64_t tag, uint64_t clientID) : + DragonMsg(DDGetMsg::TC, tag), mClientID(clientID) {} + +void +DDGetMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDGetDef::Builder builder = msg.initDdGet(); + builder.setClientID(mClientID); +} + +dragonError_t +DDGetMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDGetDef::Reader mReader = reader.getDdGet(); + + (*msg) = new DDGetMsg(reader.getTag(), mReader.getClientID()); + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGet message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDGetMsg::clientID() +{ + return mClientID; +} + +/********************************************************************************************************/ +/* ddict get response message */ + +DDGetResponseMsg::DDGetResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDGetResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDGetResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDGetResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGetResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict pop message */ + +DDPopMsg::DDPopMsg(uint64_t tag, uint64_t clientID) : + DragonMsg(DDPopMsg::TC, tag), mClientID(clientID) {} + +void +DDPopMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDPopDef::Builder builder = msg.initDdPop(); + builder.setClientID(mClientID); +} + +dragonError_t +DDPopMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDPopDef::Reader mReader = reader.getDdPop(); + + (*msg) = new DDPopMsg(reader.getTag(), mReader.getClientID()); + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGet message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDPopMsg::clientID() +{ + return mClientID; +} + +/********************************************************************************************************/ +/* ddict pop response message */ + +DDPopResponseMsg::DDPopResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDPopResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDPopResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDPopResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGetResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict contains message */ + +DDContainsMsg::DDContainsMsg(uint64_t tag, uint64_t clientID) : + DragonMsg(DDContainsMsg::TC, tag), mClientID(clientID) {} + +void +DDContainsMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDContainsDef::Builder builder = msg.initDdContains(); + builder.setClientID(mClientID); +} + +dragonError_t +DDContainsMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDContainsDef::Reader mReader = reader.getDdContains(); + + (*msg) = new DDContainsMsg(reader.getTag(), mReader.getClientID()); + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGet message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDContainsMsg::clientID() +{ + return mClientID; +} + +/********************************************************************************************************/ +/* ddict contains response message */ + +DDContainsResponseMsg::DDContainsResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDContainsResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDContainsResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDContainsResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGetResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict get length message */ + +DDGetLengthMsg::DDGetLengthMsg(uint64_t tag, uint64_t clientID) : + DragonMsg(DDGetLengthMsg::TC, tag), mClientID(clientID) {} + +void +DDGetLengthMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDGetLengthDef::Builder builder = msg.initDdGetLength(); + builder.setClientID(mClientID); +} + +dragonError_t +DDGetLengthMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDGetLengthDef::Reader mReader = reader.getDdGetLength(); + + (*msg) = new DDGetLengthMsg(reader.getTag(), mReader.getClientID()); + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGet message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDGetLengthMsg::clientID() +{ + return mClientID; +} + +/********************************************************************************************************/ +/* ddict get length response message */ + +DDGetLengthResponseMsg::DDGetLengthResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, uint64_t length): + DragonResponseMsg(DDGetLengthResponseMsg::TC, tag, ref, err, errInfo), mLength(length) {} + +void +DDGetLengthResponseMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDGetLengthResponseDef::Builder builder = msg.initDdGetLengthResponse(); + builder.setLength(mLength); +} + +dragonError_t +DDGetLengthResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + DDGetLengthResponseDef::Reader mReader = reader.getDdGetLengthResponse(); + + + (*msg) = new DDGetLengthResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr(), + mReader.getLength()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGetResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDGetLengthResponseMsg::length() +{ + return mLength; +} + +/********************************************************************************************************/ +/* ddict clear message */ + +DDClearMsg::DDClearMsg(uint64_t tag, uint64_t clientID) : + DragonMsg(DDClearMsg::TC, tag), mClientID(clientID) {} + +void +DDClearMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDClearDef::Builder builder = msg.initDdClear(); + builder.setClientID(mClientID); +} + +dragonError_t +DDClearMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDClearDef::Reader mReader = reader.getDdClear(); + + (*msg) = new DDClearMsg(reader.getTag(), mReader.getClientID()); + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGet message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDClearMsg::clientID() +{ + return mClientID; +} + +/********************************************************************************************************/ +/* ddict clear response message */ + +DDClearResponseMsg::DDClearResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDClearResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDClearResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDClearResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGetResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ +/* ddict get iterator message */ + +DDGetIteratorMsg::DDGetIteratorMsg(uint64_t tag, uint64_t clientID) : + DragonMsg(DDGetIteratorMsg::TC, tag), mClientID(clientID) {} + +void +DDGetIteratorMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDGetIteratorDef::Builder builder = msg.initDdGetIterator(); + builder.setClientID(mClientID); +} + +dragonError_t +DDGetIteratorMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDGetIteratorDef::Reader mReader = reader.getDdGetIterator(); + + (*msg) = new DDGetIteratorMsg(reader.getTag(), mReader.getClientID()); + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGet message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDGetIteratorMsg::clientID() +{ + return mClientID; +} + +/********************************************************************************************************/ +/* ddict get iterator response message */ + +DDGetIteratorResponseMsg::DDGetIteratorResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo, uint64_t iterID): + DragonResponseMsg(DDGetIteratorResponseMsg::TC, tag, ref, err, errInfo), mIterID(iterID) {} + +void +DDGetIteratorResponseMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDGetIteratorResponseDef::Builder builder = msg.initDdGetIteratorResponse(); + builder.setIterID(mIterID); +} + +dragonError_t +DDGetIteratorResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + DDGetIteratorResponseDef::Reader mReader = reader.getDdGetIteratorResponse(); + + + (*msg) = new DDGetIteratorResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr(), + mReader.getIterID()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGetResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDGetIteratorResponseMsg::iterID() +{ + return mIterID; +} + +/********************************************************************************************************/ +/* ddict iterator next message */ + +DDIteratorNextMsg::DDIteratorNextMsg(uint64_t tag, uint64_t clientID, uint64_t iterID) : + DragonMsg(DDIteratorNextMsg::TC, tag), mClientID(clientID), mIterID(iterID) {} + +void +DDIteratorNextMsg::builder(MessageDef::Builder& msg) +{ + DragonMsg::builder(msg); + DDIteratorNextDef::Builder builder = msg.initDdIteratorNext(); + builder.setClientID(mClientID); + builder.setIterID(mIterID); +} + +dragonError_t +DDIteratorNextMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + DDIteratorNextDef::Reader mReader = reader.getDdIteratorNext(); + + (*msg) = new DDIteratorNextMsg(reader.getTag(), mReader.getClientID(), mReader.getIterID()); + } catch (...) 
{ + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGet message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +uint64_t +DDIteratorNextMsg::clientID() +{ + return mClientID; +} + +uint64_t +DDIteratorNextMsg::iterID() +{ + return mIterID; +} + +/********************************************************************************************************/ +/* ddict iterator next response message */ + +DDIteratorNextResponseMsg::DDIteratorNextResponseMsg(uint64_t tag, uint64_t ref, dragonError_t err, const char* errInfo): + DragonResponseMsg(DDIteratorNextResponseMsg::TC, tag, ref, err, errInfo) {} + + +dragonError_t +DDIteratorNextResponseMsg::deserialize(MessageDef::Reader& reader, DragonMsg** msg) +{ + try { + ResponseDef::Reader rReader = reader.getResponseOption().getValue(); + + (*msg) = new DDIteratorNextResponseMsg ( + reader.getTag(), + rReader.getRef(), + (dragonError_t)rReader.getErr(), + rReader.getErrInfo().cStr()); + + } catch (...) { + err_return(DRAGON_FAILURE, "There was an exception while deserializing the DDGetResponse message."); + } + + no_err_return(DRAGON_SUCCESS); +} + +/********************************************************************************************************/ + +typedef dragonError_t (*deserializeFun)(MessageDef::Reader& reader, DragonMsg** msg); + +static unordered_map deserializeFunctions +{ + {SH_CREATE_PROCESS_LOCAL_CHANNEL, &SHCreateProcessLocalChannel::deserialize}, + {SH_CREATE_PROCESS_LOCAL_CHANNEL_RESPONSE, &SHCreateProcessLocalChannelResponse::deserialize}, + {SH_SET_KV, &SHSetKVMsg::deserialize}, + {SH_SET_KV_RESPONSE, &SHSetKVResponseMsg::deserialize}, + {SH_GET_KV, &SHGetKVMsg::deserialize}, + {SH_GET_KV_RESPONSE, &SHGetKVResponseMsg::deserialize}, + {DD_REGISTER_CLIENT, &DDRegisterClientMsg::deserialize}, + {DD_DESTROY, &DDDestroyMsg::deserialize}, + {DD_DESTROY_RESPONSE, &DDDestroyResponseMsg::deserialize}, + {DD_DESTROY_MANAGER, &DDDestroyManagerMsg::deserialize}, + {DD_DESTROY_MANAGER_RESPONSE, &DDDestroyManagerResponseMsg::deserialize}, + {DD_REGISTER_MANAGER, &DDRegisterManagerMsg::deserialize}, + {DD_REGISTER_MANAGER_RESPONSE, &DDRegisterManagerResponseMsg::deserialize}, + {DD_REGISTER_CLIENT_ID, &DDRegisterClientIDMsg::deserialize}, + {DD_REGISTER_CLIENT_ID_RESPONSE, &DDRegisterClientIDResponseMsg::deserialize}, + {DD_PUT, &DDPutMsg::deserialize}, + {DD_PUT_RESPONSE, &DDPutResponseMsg::deserialize}, + {DD_GET, &DDGetMsg::deserialize}, + {DD_GET_RESPONSE, &DDGetResponseMsg::deserialize}, + {DD_POP, &DDPopMsg::deserialize}, + {DD_POP_RESPONSE, &DDPopResponseMsg::deserialize}, + {DD_CONTAINS, &DDContainsMsg::deserialize}, + {DD_CONTAINS_RESPONSE, &DDContainsResponseMsg::deserialize}, + {DD_GET_LENGTH, &DDGetLengthMsg::deserialize}, + {DD_GET_LENGTH_RESPONSE, &DDGetLengthResponseMsg::deserialize}, + {DD_CLEAR, &DDClearMsg::deserialize}, + {DD_CLEAR_RESPONSE, &DDClearResponseMsg::deserialize}, + {DD_GET_ITERATOR, &DDGetIteratorMsg::deserialize}, + {DD_GET_ITERATOR_RESPONSE, &DDGetIteratorResponseMsg::deserialize}, + {DD_ITERATOR_NEXT, &DDIteratorNextMsg::deserialize}, + {DD_ITERATOR_NEXT_RESPONSE, &DDIteratorNextResponseMsg::deserialize} +}; + +/* From here on down we should put this in an api.cpp and api.hpp to house Dragon API code. 
*/ + +dragonError_t +recv_fli_msg(dragonFLIRecvHandleDescr_t* recvh, DragonMsg** msg, const timespec_t* timeout) +{ + int fd; + + try { + dragonError_t err; + err = dragon_fli_create_readable_fd(recvh, &fd, timeout); + if (err != DRAGON_SUCCESS) + err_return(err, "Could not create readable file descriptor to read message."); + + ::capnp::PackedFdMessageReader message(fd); + + close(fd); + + err = dragon_fli_finalize_readable_fd(recvh); + if (err != DRAGON_SUCCESS) + err_return(err, "Could not finalize readable file descriptor."); + + MessageDef::Reader reader = message.getRoot(); + MessageType tc = (MessageType)reader.getTc(); + + if (deserializeFunctions.count(tc) == 0) + err_return(DRAGON_INVALID_MESSAGE, dragon_msg_tc_name(tc)); + + err = (deserializeFunctions.at(tc))(reader, msg); + if (err != DRAGON_SUCCESS) + err_return(err, "Could not deserialize message."); + + } catch (...) { + err_return(DRAGON_INVALID_OPERATION, "There was an error while receiving the message from the fli."); + } + + no_err_return(DRAGON_SUCCESS); +} + +const char* dragon_msg_tc_name(uint64_t tc) +{ + auto tc_enum = static_cast(tc); + if (tcMap.count(tc_enum) == 0) { + std::stringstream err_str; + err_str << "Typecode " << tc << " is not a valid message type."; + return err_str.str().c_str(); + } + + return tcMap.at(tc_enum).c_str(); +} + +//#include "err.h" +char * dragon_getlasterrstr(); + +using namespace std; + +DragonError::DragonError(const dragonError_t err, const char* err_str): + err(err), err_str(err_str) +{} + +DragonError::~DragonError() {} + +dragonError_t DragonError::get_rc() const { + return err; +} + +const char* DragonError::get_err_str() const { + return err_str.c_str(); +} + + +/* This is used to support talking to the local services on the same node. The following + code provides a thread lock for multi-threaded support of communication the LS. 
*/ + +static void* sh_return_lock_space = NULL; +static dragonLock_t sh_return_lock; +static bool sh_return_lock_initd = false; + +dragonError_t init_sh_return_lock() { + dragonError_t err; + + if (sh_return_lock_initd == false) { + sh_return_lock_space = malloc(dragon_lock_size(DRAGON_LOCK_FIFO_LITE)); + if (sh_return_lock_space == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for sh_return lock."); + + err = dragon_lock_init(&sh_return_lock, sh_return_lock_space, DRAGON_LOCK_FIFO_LITE); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not initialize the threading sh_return_lock."); + sh_return_lock_initd = true; + } + + no_err_return(DRAGON_SUCCESS); +} + + +static dragonError_t +dragon_get_shep_return_cd(char** shep_return_cd) +{ + if (shep_return_cd == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The shep_return_cd argument cannot be NULL."); + + *shep_return_cd = getenv("DRAGON_SHEP_RET_CD"); + + if (*shep_return_cd == NULL) + err_return(DRAGON_INVALID_OPERATION, "The local shepherd return channel descriptor is not set in the environment."); + + no_err_return(DRAGON_SUCCESS); +} + +static dragonError_t +dragon_get_shep_cd(char** shep_cd) +{ + if (shep_cd == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The shep_cd argument cannot be NULL."); + + *shep_cd = getenv("DRAGON_LOCAL_SHEP_CD"); + + if (*shep_cd == NULL) + err_return(DRAGON_INVALID_OPERATION, "The local shepherd channel descriptor is not set in the environment."); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t dragon_get_return_sh_fli(dragonFLIDescr_t* return_fli) +{ + dragonError_t err; + dragonChannelDescr_t shep_return_ch; + dragonChannelSerial_t shep_return_ser; + char* shep_ret_cd; + + if (return_fli == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The return_fli argument cannot be NULL."); + + err = dragon_get_shep_return_cd(&shep_ret_cd); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not do send/receive operation since Local Services return cd environment variable was not correctly set."); + + shep_return_ser.data = dragon_base64_decode(shep_ret_cd, &shep_return_ser.len); + + err = dragon_channel_attach(&shep_return_ser, &shep_return_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not attach to Local Services return channel."); + + err = dragon_channel_serial_free(&shep_return_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free the serialized channel structure."); + + err = dragon_fli_create(return_fli, &shep_return_ch, NULL, NULL, 0, NULL, true, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create return Local Services FLI."); + + no_err_return(DRAGON_SUCCESS); + +} + +dragonError_t +dragon_sh_send_receive(DragonMsg* req_msg, DragonMsg** resp_msg, dragonFLIDescr_t* return_fli, const timespec_t* timeout) +{ + dragonError_t err; + char* shep_cd; + dragonChannelSerial_t shep_ser; + dragonChannelDescr_t shep_ch; + dragonFLIDescr_t shep_fli; + dragonFLISendHandleDescr_t sendh; + dragonFLIRecvHandleDescr_t recvh; + /* The header is temporary while the local services still uses connection to receive bytes. 
*/ + uint64_t header = 0xFFFFFFFFFFFFFF40; + uint64_t req_tag = req_msg->tag(); + bool have_resp = false; + + if (req_msg == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The req_msg argument cannot be NULL."); + + if (resp_msg == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The resp_msg argument cannot be NULL."); + + if (return_fli == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The return_fli argument cannot be NULL."); + + err = init_sh_return_lock(); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not do intialize the sh_return thread lock."); + + err = dragon_get_shep_cd(&shep_cd); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not do send/receive operation since Local Services cd environment variable was not correctly set."); + + shep_ser.data = dragon_base64_decode(shep_cd, &shep_ser.len); + + err = dragon_channel_attach(&shep_ser, &shep_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not attach to Local Services input channel."); + + err = dragon_channel_serial_free(&shep_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free the serialized channel structure."); + + err = dragon_fli_create(&shep_fli, &shep_ch, NULL, NULL, 0, NULL, true, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create main Local Services FLI."); + + err = dragon_lock(&sh_return_lock); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not lock the sh_return channel"); + + err = dragon_fli_open_send_handle(&shep_fli, &sendh, NULL, timeout); + if (err != DRAGON_SUCCESS) { + dragon_unlock(&sh_return_lock); + append_err_return(err, "Could not open send handle."); + } + + /* The following is sent temporarily while the local services still uses the old + connection.recv code to receive messages. Since unpickle is looking for header + data, we send the following 8 byte header that tells unpickle to receive it + as bytes. A couple other minor modifications in the Peer2PeerReadingChannelFile and + in connection.py were needed as well to allow the bytes data to pass through. The + other modifications allow a size greater than the number of bytes to be passed in + the header since the size is not known before it is written. The other change, in + connection.py, allows a bytes object to be returned when it can't be unpickled. */ + + err = dragon_fli_send_bytes(&sendh, sizeof(header), (uint8_t*)&header, 0, true, timeout); + if (err != DRAGON_SUCCESS) { + dragon_unlock(&sh_return_lock); + append_err_return(err, "Could not send header."); + } + + err = req_msg->send(&sendh, timeout); + if (err != DRAGON_SUCCESS) { + dragon_unlock(&sh_return_lock); + append_err_return(err, "Could not send DragonMsg."); + } + + err = dragon_fli_close_send_handle(&sendh, timeout); + if (err != DRAGON_SUCCESS) { + dragon_unlock(&sh_return_lock); + append_err_return(err, "Could not close send handle."); + } + + err = dragon_fli_open_recv_handle(return_fli, &recvh, NULL, timeout); + if (err != DRAGON_SUCCESS) { + dragon_unlock(&sh_return_lock); + append_err_return(err, "Could not open receive handle."); + } + + /* This while loop is here out of an abundance of caution in case + a previous request timed out and then later returned a response + to the channel. If that happened, we may need to throw away some + messages. 
*/ + while (!have_resp) { + err = recv_fli_msg(&recvh, resp_msg, timeout); + if (err != DRAGON_SUCCESS) { + dragon_unlock(&sh_return_lock); + append_err_return(err, "Could not open receive response message."); + } + + DragonResponseMsg* resp = static_cast(*resp_msg); + + if (resp->ref() == req_tag) + have_resp = true; + else /* toss it */ + delete resp; + } + + err = dragon_unlock(&sh_return_lock); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not unlock the sh_return channel."); + + err = dragon_fli_close_recv_handle(&recvh, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not close receive handle."); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_create_process_local_channel(dragonChannelDescr_t* ch, const timespec_t* timeout) +{ + dragonError_t err; + char* ser_fli; + char *end; + const char* puid_str; + dragonFLIDescr_t return_fli; + dragonFLISerial_t return_fli_ser; + DragonMsg* resp_msg; + SHCreateProcessLocalChannelResponse* resp; + dragonChannelSerial_t ch_ser; + + if (ch == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The ch argument cannot be NULL."); + + err = dragon_get_return_sh_fli(&return_fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get the Local Services return channel."); + + err = dragon_fli_serialize(&return_fli, &return_fli_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not serialize the return fli"); + + ser_fli = dragon_base64_encode(return_fli_ser.data, return_fli_ser.len); + + err = dragon_fli_serial_free(&return_fli_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free the serialized fli structure."); + + puid_str = getenv("DRAGON_MY_PUID"); + if (puid_str == NULL) + err_return(DRAGON_INVALID_OPERATION, "The DRAGON_MY_PUID environment variable was not set."); + + const long puid = strtol(puid_str, &end, 10); + + SHCreateProcessLocalChannel msg(inc_sh_tag(), puid, ser_fli); + + err = dragon_sh_send_receive(&msg, &resp_msg, &return_fli, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not complete send/receive operation."); + + if (resp_msg->tc() != SHCreateProcessLocalChannelResponse::TC) + err_return(err, "Expected an SHCreateProcessLocalChannelResponse and did not get it."); + + resp = static_cast(resp_msg); + + if (resp->err() != DRAGON_SUCCESS) + err_return(resp->err(), resp->errInfo()); + + const char* ser_chan = resp->serChannel(); + + ch_ser.data = dragon_base64_decode(ser_chan, &ch_ser.len); + + err = dragon_channel_attach(&ch_ser, ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not attach to process local channel."); + + delete resp; + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_ls_set_kv(const unsigned char* key, const unsigned char* value, const timespec_t* timeout) +{ + dragonError_t err; + char* ser_fli; + dragonFLIDescr_t return_fli; + dragonFLISerial_t return_fli_ser; + DragonMsg* resp_msg; + SHSetKVResponseMsg* resp; + + if (key == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The key argument cannot be NULL."); + + if (value == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The value argument cannot be NULL."); + + err = dragon_get_return_sh_fli(&return_fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get the Local Services return channel."); + + err = dragon_fli_serialize(&return_fli, &return_fli_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not serialize the return fli"); + + ser_fli = dragon_base64_encode(return_fli_ser.data, 
return_fli_ser.len); + + err = dragon_fli_serial_free(&return_fli_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free the serialized fli structure."); + + SHSetKVMsg msg(inc_sh_tag(), (char*)key, (char*)value, ser_fli); + + err = dragon_sh_send_receive(&msg, &resp_msg, &return_fli, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not complete send/receive operation."); + + if (resp_msg->tc() != SHSetKVResponseMsg::TC) + err_return(err, "Expected an SHSetKVResponse and did not get it."); + + resp = static_cast(resp_msg); + + if (resp->err() != DRAGON_SUCCESS) + err_return(resp->err(), resp->errInfo()); + + delete resp; + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_ls_get_kv(const unsigned char* key, char** value, const timespec_t* timeout) +{ + dragonError_t err; + char* ser_fli; + dragonFLIDescr_t return_fli; + dragonFLISerial_t return_fli_ser; + DragonMsg* resp_msg; + SHGetKVResponseMsg* resp; + + if (key == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The key argument cannot be NULL."); + + if (value == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The value argument cannot be NULL."); + + err = dragon_get_return_sh_fli(&return_fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get the Local Services return channel."); + + err = dragon_fli_serialize(&return_fli, &return_fli_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not serialize the return fli"); + + ser_fli = dragon_base64_encode(return_fli_ser.data, return_fli_ser.len); + + err = dragon_fli_serial_free(&return_fli_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free the serialized fli structure."); + + SHGetKVMsg msg(inc_sh_tag(), (char*)key, ser_fli); + + err = dragon_sh_send_receive(&msg, &resp_msg, &return_fli, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not complete send/receive operation."); + + if (resp_msg->tc() != SHGetKVResponseMsg::TC) + err_return(err, "Expected an SHSetKVResponse and did not get it."); + + resp = static_cast(resp_msg); + + if (resp->err() != DRAGON_SUCCESS) + err_return(resp->err(), resp->errInfo()); + + const char* source = resp->value(); + + *value = (char*)malloc(strlen(source)+1); + + if (*value == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for value."); + + strcpy(*value, source); + + delete resp; + + no_err_return(DRAGON_SUCCESS); +} \ No newline at end of file diff --git a/src/lib/pals.c b/src/lib/pals.c index d7293ef..aa9d5cc 100644 --- a/src/lib/pals.c +++ b/src/lib/pals.c @@ -9,7 +9,6 @@ extern dragonRecvJobParams_t pmod_mparams; static void *lib_pals_handle = NULL; -static int ptrs_set = 0; static int inside_vanilla_pals = 0; @@ -34,9 +33,12 @@ pals_rc_t (*fn_pals_app_spawn)( const char *preput_envs[], const int num_envs, pals_rc_t errors[]); const char *(*fn_pals_errmsg)(pals_state_t *state); -void set_pals_function_pointers() +pals_rc_t set_pals_function_pointers() { - lib_pals_handle = dlopen("/opt/cray/pe/pals/default/lib/libpals.so", RTLD_LAZY | RTLD_GLOBAL); + lib_pals_handle = dlopen("libpals.so", RTLD_LAZY | RTLD_GLOBAL); + if (lib_pals_handle == NULL) + return PALS_FAILED; + fn_pals_init = dlsym(lib_pals_handle, "pals_init"); fn_pals_init2 = dlsym(lib_pals_handle, "pals_init2"); @@ -55,6 +57,8 @@ void set_pals_function_pointers() fn_pals_get_apid = dlsym(lib_pals_handle, "pals_get_apid"); fn_pals_app_spawn = dlsym(lib_pals_handle, "pals_app_spawn"); fn_pals_errmsg = dlsym(lib_pals_handle, 
"pals_errmsg"); + + return PALS_OK; } int get_pals_context() { @@ -86,7 +90,9 @@ pals_rc_t pals_init(pals_state_t *state) // the values PALS knows to be true. Thus, we need to make sure any // PALS functions we wrap know to send back unmodified return values, // ie: only use the results from direct calls to our PALS function pointers. - set_pals_function_pointers(); + if (set_pals_function_pointers() != PALS_OK) + return PALS_FAILED; + set_pals_context(); pals_rc_t err = fn_pals_init(state); @@ -97,7 +103,8 @@ pals_rc_t pals_init(pals_state_t *state) //// TODO: pals_init2 will always be defined, so how can PMI check if it's NULL? pals_rc_t pals_init2(pals_state_t **state) { - set_pals_function_pointers(); + if (set_pals_function_pointers() != PALS_OK) + return PALS_FAILED; set_pals_context(); // no error checking, just pass rc through to caller diff --git a/src/lib/pmod_recv_mpi.c b/src/lib/pmod_recv_mpi.c index ca7f9b6..07bbc8e 100644 --- a/src/lib/pmod_recv_mpi.c +++ b/src/lib/pmod_recv_mpi.c @@ -156,7 +156,7 @@ attach_to_parent_channel(dragonChannelDescr_t *parent_ch, dragonChannelSerial_t parent_ch_ser; - parent_ch_ser.data = dragon_base64_decode(tmp, strlen(tmp), &parent_ch_ser.len); + parent_ch_ser.data = dragon_base64_decode(tmp, &parent_ch_ser.len); dragonError_t err = DRAGON_SUCCESS; diff --git a/src/lib/shared_lock.c b/src/lib/shared_lock.c index 7c2b59e..c283d09 100644 --- a/src/lib/shared_lock.c +++ b/src/lib/shared_lock.c @@ -434,6 +434,7 @@ dragon_fifo_lock_attach(dragonFIFOLock_t * dlock, void * ptr) { if (dlock == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); + if (ptr == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); @@ -1056,4 +1057,4 @@ dragon_greedy_lock_is_valid(dragonGreedyLock_t* dlock) return false; return *dlock->initd == LOCK_INITD; -} +} \ No newline at end of file diff --git a/src/lib/shared_lock.hpp b/src/lib/shared_lock.hpp new file mode 100644 index 0000000..f03272d --- /dev/null +++ b/src/lib/shared_lock.hpp @@ -0,0 +1,51 @@ +#ifndef HAVE_DRAGON_LOCK_HPP +#define HAVE_DRAGON_LOCK_HPP + +#include "stdlib.h" +#include "shared_lock.h" + +#include + +class dragonLock +{ +private: + + dragonLock_t dlock; + void *mem; + +public: + + dragonLock() + { + auto lock_size = dragon_lock_size(DRAGON_LOCK_FIFO_LITE); + this->mem = calloc(lock_size, 1ul); + if (this->mem == nullptr) { + throw std::runtime_error("failed to allocate memory for lock"); + } + + auto dragon_rc = dragon_lock_init(&this->dlock, mem, DRAGON_LOCK_FIFO_LITE); + if (dragon_rc != DRAGON_SUCCESS) { + throw std::runtime_error("failed to initialize lock"); + } + } + + ~dragonLock() + { + dragon_lock_destroy(&this->dlock); + free(this->mem); + } + + dragonError_t + acquire() + { + return dragon_lock(&this->dlock); + } + + dragonError_t + release() + { + return dragon_unlock(&this->dlock); + } +}; + +#endif diff --git a/src/lib/umap.cpp b/src/lib/umap.cpp index 824d412..e3a8e83 100644 --- a/src/lib/umap.cpp +++ b/src/lib/umap.cpp @@ -3,6 +3,7 @@ #include "umap.h" #include "err.h" #include +#include using namespace std; @@ -49,12 +50,17 @@ class dragonMap ~dragonMap() { dMap.clear(); + dMap_multikey.clear(); } void addItem(uint64_t key, const void * data) { dMap[key] = data; } + void addItem_multikey(std::string keys, const void * data) { + dMap_multikey[keys] = data; + } + const void * getItem(uint64_t key) { try { @@ -66,10 +72,25 @@ class dragonMap } } + const void * getItem_multikey(std::string keys) { + try + { + return dMap_multikey.at(keys); + } + catch (const out_of_range& oor) + { + return NULL; + } 
+ } + void delItem(uint64_t key) { dMap.erase(key); } + void delItem_multikey(std::string keys) { + dMap_multikey.erase(keys); + } + uint64_t new_key() { do { lkey = hash(lkey); @@ -80,6 +101,7 @@ class dragonMap private: unordered_map dMap; + unordered_map dMap_multikey; uint64_t lkey; /* this is hash function based on splitmix64 from @@ -156,6 +178,23 @@ dragon_umap_additem(dragonMap_t * dmap, const uint64_t key, const void * data) no_err_return(DRAGON_SUCCESS); } +/* TODO: Pass in an array of keys? */ +dragonError_t +dragon_umap_additem_multikey(dragonMap_t * dmap, const uint64_t key0, const uint64_t key1, const void * data) +{ + if (dmap == NULL) + err_return(DRAGON_INVALID_ARGUMENT,"The dmap handle is NULL. Cannot add item."); + + dragonMap * cpp_map; + cpp_map = static_cast(dmap->_map); + + __lock_map(dmap); + cpp_map->addItem_multikey(std::to_string(key0) + std::to_string(key1), data); + __unlock_map(dmap); + + no_err_return(DRAGON_SUCCESS); +} + dragonError_t dragon_umap_additem_genkey(dragonMap_t * dmap, const void * data, uint64_t * new_key) { @@ -173,7 +212,6 @@ dragon_umap_additem_genkey(dragonMap_t * dmap, const void * data, uint64_t * new no_err_return(DRAGON_SUCCESS); } - dragonError_t dragon_umap_getitem(dragonMap_t * dmap, const uint64_t key, void ** data) { @@ -194,6 +232,26 @@ dragon_umap_getitem(dragonMap_t * dmap, const uint64_t key, void ** data) no_err_return(DRAGON_SUCCESS); } +dragonError_t +dragon_umap_getitem_multikey(dragonMap_t * dmap, const uint64_t key0, const uint64_t key1, void ** data) +{ + if (dmap == NULL || data == NULL) + err_return(DRAGON_INVALID_ARGUMENT,"The dmap handle is NULL. Cannot get an item from it."); + + dragonMap * cpp_map; + cpp_map = static_cast(dmap->_map); + + __lock_map(dmap); + *data = (void*)cpp_map->getItem_multikey(std::to_string(key0) + std::to_string(key1)); + __unlock_map(dmap); + + if (*data == NULL) { + err_return(DRAGON_MAP_KEY_NOT_FOUND,"The dmap item is not found."); + } + + no_err_return(DRAGON_SUCCESS); +} + dragonError_t dragon_umap_delitem(dragonMap_t * dmap, const uint64_t key) { @@ -209,3 +267,19 @@ dragon_umap_delitem(dragonMap_t * dmap, const uint64_t key) no_err_return(DRAGON_SUCCESS); } + +dragonError_t +dragon_umap_delitem_multikey(dragonMap_t * dmap, const uint64_t key0, const uint64_t key1) +{ + if (dmap == NULL) + err_return(DRAGON_INVALID_ARGUMENT,"The dmap handle is NULL. 
Cannot delete the key/value pair."); + + dragonMap * cpp_map; + cpp_map = static_cast(dmap->_map); + + __lock_map(dmap); + cpp_map->delItem_multikey(std::to_string(key0) + std::to_string(key1)); + __unlock_map(dmap); + + no_err_return(DRAGON_SUCCESS); +} diff --git a/src/lib/umap.h b/src/lib/umap.h index bf4fb0a..f5ed75d 100644 --- a/src/lib/umap.h +++ b/src/lib/umap.h @@ -24,15 +24,23 @@ dragon_umap_destroy(dragonMap_t * dmap); dragonError_t dragon_umap_additem(dragonMap_t * dmap, const uint64_t key, const void * data); +dragonError_t +dragon_umap_additem_multikey(dragonMap_t * dmap, const uint64_t key0, const uint64_t key1, const void * data); + dragonError_t dragon_umap_additem_genkey(dragonMap_t * dmap, const void * data, uint64_t * new_key); dragonError_t dragon_umap_getitem(dragonMap_t * dmap, const uint64_t key, void ** data); +dragonError_t +dragon_umap_getitem_multikey(dragonMap_t * dmap, const uint64_t key0, const uint64_t key1, void ** data); + dragonError_t dragon_umap_delitem(dragonMap_t * dmap, const uint64_t key); +dragonError_t +dragon_umap_delitem_multikey(dragonMap_t * dmap, const uint64_t key0, const uint64_t key1); #ifdef __cplusplus } diff --git a/src/lib/utils.c b/src/lib/utils.c index d4630b7..a198d81 100644 --- a/src/lib/utils.c +++ b/src/lib/utils.c @@ -3,6 +3,7 @@ #include #include "hostid.h" #include "err.h" +#include #include #include #include @@ -225,6 +226,26 @@ dragon_set_host_id(dragonULInt id) no_err_return(DRAGON_SUCCESS); } +/* get the front end's external IP address, along with the IP address + * for the head node, which are used to identify this Dragon runtime */ +dragonULInt +dragon_get_local_rt_uid() +{ + static dragonULInt rt_uid = 0UL; + + if (rt_uid == 0UL) { + char *rt_uid_str = getenv("DRAGON_RT_UID"); + + /* Return 0 to indicate failure */ + if (rt_uid_str == NULL) + return 0UL; + + rt_uid = (dragonULInt) strtoul(rt_uid_str, NULL, 10); + } + + return rt_uid; +} + dragonError_t dragon_set_procname(char * name) { @@ -462,6 +483,14 @@ dragon_timespec_remaining(const timespec_t * deadline, timespec_t * remaining_ti no_err_return(DRAGON_SUCCESS); } +void strip_newlines(const char* inout_str, size_t* input_length) { + size_t idx = *input_length-1; + + while (inout_str[idx] == '\n') + idx--; + + *input_length = idx+1; +} static const char encoding_table[] = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', @@ -492,13 +521,14 @@ static const unsigned char decoding_table[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; char* -dragon_base64_encode(uint8_t *data, size_t input_length, size_t *output_length) { +dragon_base64_encode(uint8_t *data, size_t input_length) +{ const int mod_table[] = { 0, 2, 1 }; - *output_length = 4 * ((input_length + 2) / 3); + size_t output_length = 4 * ((input_length + 2) / 3); - char *encoded_data = (char*)malloc(1 + *output_length); + char *encoded_data = (char*)malloc(1 + output_length); if (encoded_data == NULL) return NULL; @@ -518,19 +548,24 @@ dragon_base64_encode(uint8_t *data, size_t input_length, size_t *output_length) } for (int i = 0; i < mod_table[input_length % 3]; i++) - encoded_data[*output_length - 1 - i] = '='; + encoded_data[output_length - 1 - i] = '='; - encoded_data[*output_length] = '\0'; + encoded_data[output_length] = '\0'; return encoded_data; } uint8_t* -dragon_base64_decode(const char *data, size_t input_length, size_t *output_length) { +dragon_base64_decode(const char *data, size_t *output_length) +{ + size_t input_length = strlen(data); + + 
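+    /* the input is a NUL-terminated string here; trailing newline characters are
+     * trimmed from the effective length before the multiple-of-4 validation below */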
strip_newlines(data, &input_length); if (input_length % 4 != 0) return NULL; + *output_length = input_length / 4 * 3; if (data[input_length - 1] == '=') (*output_length)--; @@ -573,3 +608,77 @@ dragon_hash_ulint(dragonULInt x) z = (z ^ (z >> 27)) * 0x94d049bb133111eb; return z ^ (z >> 31); } + +dragonULInt +dragon_hash(void* ptr, size_t num_bytes) +{ + if (num_bytes == 0) + return 0; + + if (ptr == NULL) + return 0; + + dragonULInt alignment = sizeof(dragonULInt) - (dragonULInt)ptr % sizeof(dragonULInt); + if (alignment == sizeof(dragonULInt)) + alignment = 0; + + long num_words = (num_bytes-alignment)/sizeof(dragonULInt); + + long rem = (num_bytes-alignment)%sizeof(dragonULInt); + + uint8_t* first_bytes = (uint8_t*) ptr; + dragonULInt* arr = (dragonULInt*) ptr + alignment; + uint8_t* last_bytes = (uint8_t*)&arr[num_words]; + + dragonULInt hashVal = 0; + + long i; + for (i=0;iLS Group Creation of processes #641 by eric-cozzi was merged Apr 11, 2024 + +Overhaul of ProcessGroup for improved scalability and reliability [#653, #647 #625] by nicholas-hill was merged Apr 18, 2024; Apr 15, 2024; Mar 25, 2024 + + +### Added + +Add signal handling for clean up when workload manager experiences allocation timeout #650 by wahlc was merged Apr 18, 2024 + +New Distributed Dictionary and Checkpointing Design [#644, #629] by kent-lee was merged Apr 18, 2024, Apr 3, 2024 + +Update Release Tests #639 by veena-venkata-ghorakavi was merged Apr 5, 2024 + +On node gpu framework and init #633 by nick-radcliffe was merged Apr 9, 2024 + +Add hints to transport payloads #630 by nick-radcliffe was merged Mar 27, 2024 + +Expose policy for process placement #628 by wahlc was merged Mar 26, 2024 + +### Removed + +Limit docs update to one build of master #623 by mohammad-hadi was merged Mar 15, 2024 + +### Fixed + +Fix bad bcast tree init regression brought out by resiliency changes. #645 by nicholas-hill was merged Apr 15, 2024 + +Fix bad frontend host ID assignment #652 by nicholas-hill was merged Apr 17, 2024 + +Ensure clean build script does a thorough clean and build #643 by kent-lee was merged Apr 9, 2024 + +Fix regression and add test for launching pool of pools #638 by wahlc was merged Apr 12, 2024 + +Fix capnproto build and install #631 by kent-lee was closed Mar 21, 2024 + +Remove HSTA from bring-up if there's only one backend node #632 by nick-radcliffe was merged Mar 25, 2024 + +## [0.8] - 2024-02-26 -## [0.8] - 2024-02-26 ### Changed -### Added +### Added Libfabric support for hsta and multi-NIC support [#620, #614, #594] by nick-radcliffe was merged Mar 7, 2024, Feb 22, 2024, Feb 15, 2024 diff --git a/src/pkg/INSTALL.md b/src/pkg/INSTALL.md index 120eb12..246fccd 100644 --- a/src/pkg/INSTALL.md +++ b/src/pkg/INSTALL.md @@ -4,40 +4,33 @@ shared objects are separated out from the Dragon Python wheel file. This is to Dragon runtime environment from other languages, such as Fortran/C/C++. Before you can run programs using Dragon, you must set up the run-time for your -environment. You must have Python 3.9, 3.10, or 3.11 installed and it must be in your path -somewhere. A common choice is to use a Python virtual environment, which can be initialized -from a base Python 3.9+ with: +environment. You must have Python 3.9, 3.10, or 3.11 installed and it must correspond to the version of the Dragon package that was downloaded. A common choice for running Python programs is to use a Python virtual +environment. 
An install script is supplied in the distribution that performs the +install step(s) for you and creates and activates a virtual environment. You will +find this install script in the untarred distribution file at the root level. + + ./dragon-install - python3 -m venv --clear _env - . _env/bin/activate +You have completed the prerequisites for running Dragon with multiprocessing programs. If you are not in the virtual environment, you may need to navigate to the untarred distribution file at the root level and follow the commands below for activating the virtual environment. +If you have already installed and want to come back and use your install at a later +time you may have to reactivate your environment. Execute this from the same directory as the install was run from above. -The untarred distribution file contains several subdirectories. All provided commands -are relative to the directory that contains the README.md. + . _env/bin/activate -The `dragon-*.whl` file must be pip3 installed once for your environment. +Along with reactivating your environment you will also need to load the dragon +module. - pip3 install --force-reinstall dragon-0.8-*.whl + module use $PWD/modulefiles + module load dragon +If you are NOT using a virtual environment then check and possibly update the +`$PATH` so it has the location of pip installed console scripts, such as +~/.local/bin. If using a virtual environment, this step is not necessary. -Check and possibly update that `$PATH` is has the location of pip installed -console scripts, such as ~/.local/bin if you're not using a virtual environment. + export PATH=~/.local/bin:${PATH} - export PATH=~/.local/bin:${PATH} - - -Set up the path to the Dragon module - - module use [/absolute path to directory with this INSTALL.md file]/modulefiles - - -Load the Dragon module - - module load dragon - - -Test your configuration using the point-to-point latency test. You should see output similar to below after the -`dragon` command. +You can test your configuration using the point-to-point latency test. You should see output similar to below after the `dragon` command for a single node allocation. ``` cd examples/multiprocessing diff --git a/src/pkg/README.md b/src/pkg/README.md index 02ec8de..f6fdc7d 100644 --- a/src/pkg/README.md +++ b/src/pkg/README.md @@ -23,10 +23,13 @@ The untarred distribution file contains several subdirectories directories including the following. All provided commands are relative to the directory that contains this README.md. +* The pycapnp-*.whl file must be pip3 installed once for your environment. + + pip3 install --force-reinstall pycapnp-*.whl * The dragon-*.whl file must be pip3 installed once for your environment. - pip3 install --force-reinstall dragon-0.8-*.whl + pip3 install --force-reinstall dragon-0.9-*.whl * Check and possibly update that `$PATH` has the location of pip installed console scripts, such as ~/.local/bin @@ -36,7 +39,7 @@ that contains this README.md. * modulefiles - This contains module files that are needed when using Dragon. You must set up the environment by loading the dragon module as follows. - module use [/path to dragon-0.8]/modulefiles + module use [/path to dragon-0.9]/modulefiles module load dragon If you intend to use Dragon on your own Linux VM or an image that you @@ -56,7 +59,7 @@ that contains this README.md. itself. 
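
For orientation, a minimal program of the kind found under `examples/multiprocessing` typically follows the pattern sketched below. This is a sketch only; the `dragon` start-method registration and the `dragon hello.py` launch line are assumptions drawn from the multiprocessing examples referenced in this text, not part of this change set.

```python
# hello.py -- hypothetical minimal example; run with: dragon hello.py
import dragon              # makes the "dragon" multiprocessing start method available
import multiprocessing as mp


def square(x):
    return x * x


if __name__ == "__main__":
    mp.set_start_method("dragon")
    with mp.Pool(4) as pool:
        print(pool.map(square, range(8)))
```

From within the activated environment and with the dragon module loaded, such a script would be launched with the `dragon` command rather than plain `python3`.
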
Also under this directory are the standard Python multiprocessing unit tests packaged for easier use with Dragon (these have been developed for Python 3.9 but are still in progress for Python 3.10 and 3.11.) There is a - README.md in the `examples` directory with more information about these + README.md in the `examples` directory with more information about these demonstration programs. * dragon_unittests - This directory contains a selection of Dragon-specific unit @@ -73,7 +76,7 @@ that contains this README.md. of Dragon. After doing the `pip3 install` and the -`module use [/path to dragon-0.8]/modulefiles && module load dragon` you have +`module use [/path to dragon-0.9]/modulefiles && module load dragon` you have completed the prerequisites for running Dragon multiprocessing programs. Running a Program using Dragon diff --git a/src/pkg/RELEASE_NOTES.md b/src/pkg/RELEASE_NOTES.md index 1113d95..cfeea68 100644 --- a/src/pkg/RELEASE_NOTES.md +++ b/src/pkg/RELEASE_NOTES.md @@ -1,3 +1,13 @@ +# Dragon 0.9 Release Summary +This release augments scalability and performance for launching 10k or more processes and greatly improves distributed dictionary +performanace. Other highlighted features: + +- Improvements to ProcessGroup to provide better user experience and performance +- Improve launch time for large numbers of processes by enabling batch launch +- New implementation for distributed dictionary that improves performances and scalability +- Support for placement of processes via Policy API +- Bug fix for launching a Pool of pools + # Dragon 0.8 Release Summary This package introduces new features that enhance portability, further optimize performance at scale, and increase usability with packages that rely on Python multiprocessing derivatives. Highlighted new features are: diff --git a/src/setup.py b/src/setup.py index b20fcfb..0b16124 100644 --- a/src/setup.py +++ b/src/setup.py @@ -78,6 +78,13 @@ def run(self): if not lib_tempdir.is_symlink(): raise + message_defs_file = rootdir / 'dragon' / 'infrastructure' / 'message_defs.capnp' + try: + message_defs_file.symlink_to(rootdir / 'lib' / 'message_defs.capnp') + except: + if not message_defs_file.is_symlink(): + raise + _cythonize = partial(cythonize, nthreads=int(os.environ.get('DRAGON_BUILD_NTHREADS', os.cpu_count())), show_all_warnings=True, @@ -192,11 +199,10 @@ def run(self): description="Python multiprocessing over the Dragon distributed runtime", packages=find_packages(), package_data={ - 'dragon': ['lib/libdragon.so', 'lib/libpmod.so', 'lib/libpmsgqueue.so',] + 'dragon': ['lib/libdragon.so', 'lib/libpmod.so', 'lib/libpmsgqueue.so'] }, ext_modules = extensions, entry_points=entry_points, python_requires=">=3.9", - install_requires=['cloudpickle', 'numpy', ], - + install_requires=['cloudpickle', 'numpy'] ) diff --git a/src/tools/dragon-env b/src/tools/dragon-env new file mode 100755 index 0000000..3876f1f --- /dev/null +++ b/src/tools/dragon-env @@ -0,0 +1,218 @@ +#!/usr/bin/python3 + +import argparse +import json +import os +import subprocess +import sys + +from pathlib import Path + +def remove_suffix(string_w_suffix, suffix): + if suffix and string_w_suffix.endswith(suffix): + return string_w_suffix[:-len(suffix)] + else: + return string_w_suffix + +# get args + +parser = argparse.ArgumentParser( + prog='dragon-env', + description='Configure the build and runtime environments for Dragon in regards to 3rd party libraries. 
This is currently only needed for building with GPU support, but the number of uses will increase in future releases, including runtime configuration for libraries.' +) + +add_help = 'Add a colon-separated list of key-value pairs (key=value) to configure include and library paths for Dragon. ' +add_help += 'Possible keys: cuda-include, hip-include, ze-include' + +parser.add_argument( + '-a', '--add', + help='Add a colon-separated list of key-value pairs (key=value) to configure include and library paths for Dragon. Possible keys: cuda-include, hip-include, ze-include.' +) +parser.add_argument( + '-c', '--clear', + help='Clear all key-value pairs.', + action='store_true' +) +parser.add_argument( + '-p', '--pity', + help='Take pity on the new user. Specifically, this gives a best-effort string that can be supplied to the --add command.', + action='store_true' +) +parser.add_argument( + '-s', '--serialize', + help='Serialize all key-value pairs currently in the configuration file into a single, colon-separated string that can be passed to the --add command.', + action='store_true' +) + +args = parser.parse_args() + +# set base dir and other paths + +try: + base_dir = os.environ['DRAGON_BASE_DIR'] +except: + base_dir = "" + +try: + home_dir = str(Path.home()) + dragon_dir = f'{home_dir}/.dragon' + + if os.path.exists(home_dir) and not os.path.exists(dragon_dir): + os.makedirs(dragon_dir) + + env_path = f'{dragon_dir}/dragon-env.json' + makefile_path = f'{base_dir}/.dragon-config.mk' +except: + env_path = "" + makefile_path = "" + +# handle pity command +# TODO: only handling GPU vendor headers at this point, but we should +# also handle libfabric, PMI, MPI, etc. headers this way + +if args.pity: + pity_str = "" + the_first_one = True + + try: + suffix = 'cuda_runtime.h' + + cuda_include = subprocess.check_output( + f'find /opt /usr -name cuda_runtime.h | grep -m 1 "{suffix}"', + stderr=subprocess.PIPE, + shell=True, + encoding='utf-8' + ).strip() + + cuda_include = remove_suffix(cuda_include, suffix) + + if the_first_one: + pity_str += f'cuda-include={cuda_include}' + the_first_one = False + else: + pity_str += f':cuda-include={cuda_include}' + except: + pass + + try: + suffix = 'hip/hip_runtime.h' + + hip_include = subprocess.check_output( + f'find /opt /usr -name hip_runtime.h | grep -m 1 "{suffix}"', + stderr=subprocess.PIPE, + shell=True, + encoding='utf-8' + ).strip() + + hip_include = remove_suffix(hip_include, suffix) + + if the_first_one: + pity_str += f'hip-include={hip_include}' + the_first_one = False + else: + pity_str += f':hip-include={hip_include}' + except: + pass + + try: + suffix = "ze_api.h" + + ze_include = subprocess.check_output( + f'find /opt /usr -name ze_api.h | grep -m 1 "{suffix}"', + stderr=subprocess.PIPE, + shell=True, + encoding='utf-8' + ).strip() + + ze_include = remove_suffix(ze_include, suffix) + + if the_first_one: + pity_str += f'ze-include={ze_include}' + the_first_one = False + else: + pity_str += f':ze-include={ze_include}' + except: + pass + + print(pity_str, flush=True) + +# handle serialize command before updating anything + +if args.serialize: + if env_path == "": + print('failed to serialize environment: unable to find environment file', flush=True) + sys.exit() + + if os.path.isfile(env_path): + with open(env_path) as env_file: + env_dict = json.load(env_file) + + ser_config = '' + the_first_one = True + for key in env_dict: + if the_first_one: + ser_config += f'{key}={env_dict[key]}' + the_first_one = False + else: + ser_config += 
f':{key}={env_dict[key]}' + print(ser_config, flush=True) + else: + print('no environment configuration available', flush=True) + +# handle 'clear' command (do this first, so clear+set acts as a reset) + +if args.clear: + if env_path == "" or makefile_path == "": + print('failed to clear environment: unable to find environment file(s)', flush=True) + sys.exit() + + try: + os.remove(env_path) + os.remove(makefile_path) + except: + pass + +# handle 'add' command + +if args.add is not None: + if base_dir == "": + print('failed to update environment: DRAGON_BASE_DIR not set, try hack/setup', flush=True) + sys.exit() + + if env_path == "" or makefile_path == "": + print('failed to update environment: unable to find environment file(s)', flush=True) + + if os.path.isfile(env_path): + with open(env_path) as env_file: + env_dict = json.load(env_file) + else: + env_dict = {} + + user_input = args.add.split(':') + new_env = dict(kv.split('=', 1) for kv in user_input) + env_dict.update(new_env) + + with open(env_path, 'w') as env_file: + json.dump(env_dict, env_file) + + with open(makefile_path, 'w') as make_file: + for key in env_dict: + if 'include' in key: + path = env_dict[key] + make_file.write(f'CONFIG_INCLUDE := $(CONFIG_INCLUDE) -I{path}\n') + + if 'cuda' in key: + path = env_dict[key] + make_file.write(f'CONFIG_SOURCES := $(CONFIG_SOURCES) {base_dir}/lib/gpu/cuda.cpp\n') + make_file.write(f'CONFIG_DEFINES := $(CONFIG_DEFINES) -DHAVE_CUDA_INCLUDE\n') + + if 'hip' in key: + path = env_dict[key] + make_file.write(f'CONFIG_SOURCES := $(CONFIG_SOURCES) {base_dir}/lib/gpu/hip.cpp\n') + make_file.write(f'CONFIG_DEFINES := $(CONFIG_DEFINES) -DHAVE_HIP_INCLUDE\n') + + if 'ze' in key: + path = env_dict[key] + make_file.write(f'CONFIG_SOURCES := $(CONFIG_SOURCES) {base_dir}/lib/gpu/ze.cpp\n') + make_file.write(f'CONFIG_DEFINES := $(CONFIG_DEFINES) -DHAVE_ZE_INCLUDE\n') + diff --git a/test/Makefile b/test/Makefile index ecd693a..d234101 100644 --- a/test/Makefile +++ b/test/Makefile @@ -30,7 +30,9 @@ DIST_SOURCES := $(wildcard pkg/*) \ multi-node/test_array.py \ multi-node/test_barrier.py \ multi-node/test_connection.py \ - multi-node/test_dict.py \ + multi-node/test_ddict.py \ + multi-node/test_distdict.py \ + multi-node/test_fli.py \ multi-node/test_lock.py \ multi-node/test_machine.py \ multi-node/test_pool.py \ @@ -58,12 +60,13 @@ TESTS_PYTHON := test_c_files.py \ transport/test_lsif.py \ test_transport.py \ test_integration_shep_gs.py \ - test_launcher.py + test_launcher.py TESTS_DRAGON := test_mpbridge.py \ test_native.py \ - test_dict.py + test_dict.py \ + test_distdict.py .PHONY: all all: test diff --git a/test/channels_subtests/.gitignore b/test/channels_subtests/.gitignore index 9a1e062..e76a697 100644 --- a/test/channels_subtests/.gitignore +++ b/test/channels_subtests/.gitignore @@ -9,3 +9,6 @@ fch_test test_channelsets test_gateways test_fli +perf_fch +test_capnp + diff --git a/test/channels_subtests/Makefile b/test/channels_subtests/Makefile index 4015982..21ad45d 100644 --- a/test/channels_subtests/Makefile +++ b/test/channels_subtests/Makefile @@ -4,14 +4,14 @@ CFLAGS ?= -fPIC -Wall -Ofast -fomit-frame-pointer INCLUDE = -I $(DRAGON_INCLUDE_DIR) LIBS = -L $(DRAGON_LIB_DIR) -BIN_FILES = test_ch test_bch perf_fch test_send test_poll test_peek_pop test_channelsets test_wrong test_gateways test_gateway_messages test_fli +BIN_FILES = test_ch test_bch perf_fch test_send test_poll test_peek_pop test_channelsets test_wrong test_gateways test_gateway_messages test_fli test_capnp %.c.o: %.c $(CC) 
$(INCLUDE) $(CFLAGS) -c $< -o $@ default: build -build: test_ch test_bch perf_fch test_send test_poll test_channelsets test_wrong test_gateways test_gateway_messages test_fli +build: test_ch test_bch perf_fch test_send test_poll test_channelsets test_wrong test_gateways test_gateway_messages test_fli test_capnp test_ch: test_ch.c.o $(CC) $(INCLUDE) $(CFLAGS) -o test_ch $< $(LIBS) -ldragon -ldl @@ -46,5 +46,8 @@ test_gateway_messages: test_gateway_messages.c.o test_wrong: test_wrong.c.o $(CC) $(INCLUDE) $(CFLAGS) -o test_wrong $< $(LIBS) -ldragon -ldl +test_capnp: test_capnp.cpp + g++ $(INCLUDE) -std=c++14 -o test_capnp $< $(LIBS) -ldragon -ldl + clean: rm -rf *.o $(BIN_FILES) core __pycache__ diff --git a/test/channels_subtests/test_capnp.cpp b/test/channels_subtests/test_capnp.cpp new file mode 100644 index 0000000..fa145c8 --- /dev/null +++ b/test/channels_subtests/test_capnp.cpp @@ -0,0 +1,22 @@ +#include +#include +#include +#include + +// ./test_capnp > test.out +// capnp decode --packed ../../src/lib/message_defs.capnp DragonMessage < test.out +// Running this should print: +// ( header = (tc = 6, tag = 4, ref = 1, err = 0), +// registerClient = (test = "Hello World!") ) + +int main(int argc, char* argv[]) { + capnp::MallocMessageBuilder message; + + MessageDef::Builder msg = message.initRoot(); + msg.setTc(6); + msg.setTag(4); + DDRegisterClientDef::Builder rc = msg.initDdRegisterClient(); + rc.setRespFLI("Hello World!"); + capnp::writePackedMessageToFd(1, message); + return 0; +} \ No newline at end of file diff --git a/test/channels_subtests/test_capnp.py b/test/channels_subtests/test_capnp.py new file mode 100644 index 0000000..38218c5 --- /dev/null +++ b/test/channels_subtests/test_capnp.py @@ -0,0 +1,13 @@ +import capnp +import dragon.infrastructure.message_defs_capnp as schema + +def main(): + f = open('test.out', 'rb') + msg = schema.MessageDef.read_packed(f) + print(msg.which()) + print(msg.to_dict()) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/channels_subtests/test_fli.py b/test/channels_subtests/test_fli.py index a5898a4..8d25a55 100644 --- a/test/channels_subtests/test_fli.py +++ b/test/channels_subtests/test_fli.py @@ -45,6 +45,9 @@ def test_create_destroy_streaming(self): streams.append(strm) fli = FLInterface(main_ch=self.main_ch, manager_ch=manager_ch, pool=self.mpool, stream_channels=streams) + + self.assertEqual(fli.num_available_streams(), 5) + fli.destroy() # Clean up excess channels @@ -139,18 +142,17 @@ def test_create_close_recv_handle(self): recvh.close() def test_send_recv_bytes(self): - sendh = self.fli.sendh() - recvh = self.fli.recvh() - b = b'Hello World' - sendh.send_bytes(b) - sendh.close() - (x, _) = recvh.recv_bytes() # recv_bytes returns a tuple, first the bytes then the message attribute - self.assertEqual(b, x) + with self.fli.sendh() as sendh: + b = b'Hello World' + sendh.send_bytes(b) - with self.assertRaises(FLIEOT): - (x, _) = recvh.recv_bytes() # We should get back an EOT here - recvh.close() + with self.fli.recvh() as recvh: + (x, _) = recvh.recv_bytes() # recv_bytes returns a tuple, first the bytes then the message attribute + self.assertEqual(b, x) + + with self.assertRaises(FLIEOT): + (x, _) = recvh.recv_bytes() # We should get back an EOT here def test_send_recv_mem(self): sendh = self.fli.sendh() @@ -252,7 +254,5 @@ def test_pass_fli(self): self.assertEqual(42, hint) proc.join() - - if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git 
a/test/globalservices/group_api.py b/test/globalservices/group_api.py index 0c4512b..f5a6399 100644 --- a/test/globalservices/group_api.py +++ b/test/globalservices/group_api.py @@ -75,7 +75,7 @@ class GSGroupAPI(unittest.TestCase): def setUp(self) -> None: self.gs_stdout_rh, self.gs_stdout_wh = multiprocessing.Pipe(duplex=False) - self.node_sdesc = NodeDescriptor.make_for_current_node(is_primary=True).sdesc + self.node_sdesc = NodeDescriptor.get_localservices_node_conf(is_primary=True).sdesc self.some_parms = copy.copy(dparm.this_process) @@ -252,35 +252,73 @@ def create_wrap(the_exe, the_run_dir, the_args, the_env, the_name, result_list): def _send_get_responses(self, nitems, result): if result == 'fail': + shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHMultiProcessCreate) + responses = [] for i in range(nitems): - shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHProcessCreate) - shep_reply_msg = dmsg.SHProcessCreateResponse( - tag=self.next_tag(), ref=shep_msg.tag, err=dmsg.SHProcessCreateResponse.Errors.FAIL, - err_info='simulated failure' + responses.append( + dmsg.SHProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.procs[i].tag, + err=dmsg.SHProcessCreateResponse.Errors.FAIL, + err_info='simulated failure' + ) ) - self.gs_input_wh.send(shep_reply_msg.serialize()) + shep_reply_msg = dmsg.SHMultiProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.tag, + err=dmsg.SHMultiProcessCreateResponse.Errors.SUCCESS, + responses=responses + ) + self.gs_input_wh.send(shep_reply_msg.serialize()) elif result == 'success': + shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHMultiProcessCreate) + responses = [] for i in range(nitems): - shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHProcessCreate) - shep_reply_msg = dmsg.SHProcessCreateResponse( - tag=self.next_tag(), ref=shep_msg.tag, err=dmsg.SHProcessCreateResponse.Errors.SUCCESS + responses.append( + dmsg.SHProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.procs[i].tag, + err=dmsg.SHProcessCreateResponse.Errors.SUCCESS + ) ) - self.gs_input_wh.send(shep_reply_msg.serialize()) + shep_reply_msg = dmsg.SHMultiProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.tag, + err=dmsg.SHMultiProcessCreateResponse.Errors.SUCCESS, + responses=responses + ) + self.gs_input_wh.send(shep_reply_msg.serialize()) else: # we create half failed processes and half successful ones + shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHMultiProcessCreate) + + responses = [] for i in range(0, nitems//2): - shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHProcessCreate) - shep_reply_msg = dmsg.SHProcessCreateResponse( - tag=self.next_tag(), ref=shep_msg.tag, err=dmsg.SHProcessCreateResponse.Errors.SUCCESS + responses.append( + dmsg.SHProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.procs[i].tag, + err=dmsg.SHProcessCreateResponse.Errors.SUCCESS + ) ) - self.gs_input_wh.send(shep_reply_msg.serialize()) + for i in range(nitems//2, nitems): - shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHProcessCreate) - shep_reply_msg = dmsg.SHProcessCreateResponse( - tag=self.next_tag(), ref=shep_msg.tag, err=dmsg.SHProcessCreateResponse.Errors.FAIL, - err_info='simulated failure' + responses.append( + dmsg.SHProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.procs[i].tag, + err=dmsg.SHProcessCreateResponse.Errors.FAIL, + err_info='simulated failure' + ) ) - self.gs_input_wh.send(shep_reply_msg.serialize()) + + shep_reply_msg = 
dmsg.SHMultiProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.tag, + err=dmsg.SHMultiProcessCreateResponse.Errors.SUCCESS, + responses=responses + ) + self.gs_input_wh.send(shep_reply_msg.serialize()) def _create_group(self, group_items, group_policy, group_name, existing=False): def create_wrap(items, policy, the_name, result_list): @@ -675,12 +713,23 @@ def create_wrap(items, policy, the_name, result_list): self.assertEqual(int(descr.state), GroupDescriptor.State.ACTIVE) def _send_responses(self, nitems): - for i in range(nitems): - shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHProcessCreate) - shep_reply_msg = dmsg.SHProcessCreateResponse( - tag=self.next_tag(), ref=shep_msg.tag, err=dmsg.SHProcessCreateResponse.Errors.SUCCESS + shep_msg = tsu.get_and_check_type(self.shep_input_rh, dmsg.SHMultiProcessCreate) + responses = [] + for i in range(nitems): + responses.append( + dmsg.SHProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.procs[i].tag, + err=dmsg.SHProcessCreateResponse.Errors.SUCCESS ) - self.gs_input_wh.send(shep_reply_msg.serialize()) + ) + shep_reply_msg = dmsg.SHMultiProcessCreateResponse( + tag=self.next_tag(), + ref=shep_msg.tag, + err=dmsg.SHMultiProcessCreateResponse.Errors.SUCCESS, + responses=responses + ) + self.gs_input_wh.send(shep_reply_msg.serialize()) def test_create_add_to(self): # first create a group @@ -1153,4 +1202,4 @@ def test_get_list_alive_and_dead_groups(self): glist = get_list() # the returned list should be the same with the original one as get_list(() # returns both alive and dead groups - self.assertEqual(glist_orig, glist) + self.assertEqual(glist_orig, glist) \ No newline at end of file diff --git a/test/globalservices/process_api.py b/test/globalservices/process_api.py index 235fa96..6471508 100755 --- a/test/globalservices/process_api.py +++ b/test/globalservices/process_api.py @@ -70,7 +70,7 @@ class SingleProcAPIChannels(unittest.TestCase): def setUp(self) -> None: self.gs_stdout_rh, self.gs_stdout_wh = multiprocessing.Pipe(duplex=False) - self.node_sdesc = NodeDescriptor.make_for_current_node(is_primary=True).sdesc + self.node_sdesc = NodeDescriptor.get_localservices_node_conf(is_primary=True).sdesc self.some_parms = copy.copy(dparm.this_process) @@ -344,7 +344,7 @@ def test_dump_smoke(self): self.assertEqual(descr.name, "bob") - dump_msg = dmsg.GSDump(tag=self.next_tag(), filename="dump_file") + dump_msg = dmsg.GSDumpState(tag=self.next_tag(), filename="dump_file") self.gs_input_wh.send(dump_msg.serialize()) def test_query_by_name(self): diff --git a/test/globalservices/single_internal.py b/test/globalservices/single_internal.py index 43807ab..d21e53a 100755 --- a/test/globalservices/single_internal.py +++ b/test/globalservices/single_internal.py @@ -114,7 +114,7 @@ def setUp(self) -> None: self.gs_stdout_rh, self.gs_stdout_wh = multiprocessing.Pipe(duplex=False) self.shep_input_rh, self.shep_input_wh = multiprocessing.Pipe(duplex=False) self.bela_input_rh, self.bela_input_wh = multiprocessing.Pipe(duplex=False) - self.node_sdesc = NodeDescriptor.make_for_current_node(is_primary=True).sdesc + self.node_sdesc = NodeDescriptor.get_localservices_node_conf(is_primary=True).sdesc self.dut = None self.tag = 0 diff --git a/test/globalservices/single_process_msg.py b/test/globalservices/single_process_msg.py index db649e2..b57b0bd 100755 --- a/test/globalservices/single_process_msg.py +++ b/test/globalservices/single_process_msg.py @@ -81,7 +81,7 @@ def setUp(self) -> None: self.shep_input_wh = 
dconn.Connection(outbound_initializer=self.shep_input_chan) self.some_parms.local_shep_cd = B64.bytes_to_str(self.shep_input_chan.serialize()) - self.node_sdesc = NodeDescriptor.make_for_current_node(is_primary=True).sdesc + self.node_sdesc = NodeDescriptor.get_localservices_node_conf(is_primary=True).sdesc self.dut = None self.head_uid = None @@ -161,15 +161,9 @@ def _start_a_process(self, exe_name, the_args, the_name, the_tag, the_puid, the_ if pmi_required: self.assertIsNotNone(shep_msg.pmi_info) - self.assertEqual(shep_msg.pmi_info.job_id, pmi_info.job_id) self.assertEqual(shep_msg.pmi_info.lrank, pmi_info.lrank) self.assertEqual(shep_msg.pmi_info.ppn, pmi_info.ppn) self.assertEqual(shep_msg.pmi_info.nid, pmi_info.nid) - self.assertEqual(shep_msg.pmi_info.nnodes, pmi_info.nnodes) - self.assertEqual(shep_msg.pmi_info.nranks, pmi_info.nranks) - self.assertEqual(shep_msg.pmi_info.nidlist, pmi_info.nidlist) - self.assertEqual(shep_msg.pmi_info.hostlist, pmi_info.hostlist) - self.assertEqual(shep_msg.pmi_info.control_port, pmi_info.control_port) shep_reply_msg = dmsg.SHProcessCreateResponse( tag=self._tag_inc(), ref=shep_msg.tag, err=dmsg.SHProcessCreateResponse.Errors.SUCCESS @@ -447,16 +441,10 @@ def test_up_down_with_pmi_proc(self): the_puid=dfacts.LAUNCHER_PUID, the_rcuid=dfacts.BASE_BE_CUID, pmi_required=True, - pmi_info=dmsg.PMIInfo( - job_id=12323, + pmi_info=dmsg.PMIProcessInfo( lrank=0, ppn=1, nid=1, - nnodes=2, - nranks=2, - nidlist=[0,1], - hostlist=['head', 'worker'], - control_port=1023, pid_base=1 ) ) diff --git a/test/gpu/test_gpu.cpp b/test/gpu/test_gpu.cpp new file mode 100644 index 0000000..5b9c09d --- /dev/null +++ b/test/gpu/test_gpu.cpp @@ -0,0 +1,155 @@ +#include "../../src/lib/gpu/gpu.hpp" + +#include +#include +#include +#include + +#include "../_ctest_utils.h" + +#define NULL_BYTE 0x00 +#define NULL_VAL 0x0ul + +#define EXP_DST_BYTE 0xff +#define EXP_DST_VAL 0xfffffffffffffffful + +static void +check_result(dragonGPUHandle_t *gpuh, dragonError_t err, dragonError_t expected_err, int& tests_passed, int& tests_attempted, const char *file, int line) +{ + ++tests_attempted; + + if (err != expected_err) { + const int strlen = 256; + char errstr[strlen]; + dragon_gpu_get_errstr(gpuh, "GPU operation failed", err, errstr, strlen); + fprintf( + stderr, + "Test %d failed with error code %s in file %s at line %d\n", + tests_attempted, + dragon_get_rc_string(err), + file, line + ); + fprintf(stderr, "%s\n", errstr); + abort(); + } else { + ++tests_passed; + } +} + +int +main(int argc, char **argv) +{ + auto derr = DRAGON_SUCCESS; + auto ntests_passed = 0; + auto ntests_attempted = 0; + auto gpu_backend_type = DRAGON_GPU_BACKEND_CUDA; + + if (argc > 1) { + auto tmp_argstr = argv[1]; + if (0 == strcmp(tmp_argstr, "cuda")) { + gpu_backend_type = DRAGON_GPU_BACKEND_CUDA; + } else if (0 == strcmp(tmp_argstr, "hip")) { + gpu_backend_type = DRAGON_GPU_BACKEND_HIP; + } else if (0 == strcmp(tmp_argstr, "ze")) { + gpu_backend_type = DRAGON_GPU_BACKEND_ZE; + } + } + + dragonGPUHandle_t gpuh; + + derr = dragon_gpu_setup(gpu_backend_type, &gpuh); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + auto dst_addr = (void *)nullptr; + auto size = 8ul; + + derr = dragon_gpu_mem_alloc(&gpuh, &dst_addr, size); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + // start with a sanity test + + volatile uint64_t *dst_val = (volatile uint64_t *) malloc(sizeof(uint64_t)); + assert(dst_val != nullptr); + 
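+    // Test flow: the parent allocates a device buffer and publishes it via an IPC
+    // handle, runs a memset/D2H sanity check, clears the buffer, and forks. The
+    // child attaches through the IPC handle and writes the expected pattern with a
+    // device-to-device copy; the parent polls D2H copies until the flag appears.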
+ *dst_val = 0ul; + + derr = dragon_gpu_memset(&gpuh, dst_addr, EXP_DST_BYTE, size); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_copy(&gpuh, (void *) dst_val, dst_addr, size, DRAGON_GPU_D2H); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + assert(*dst_val == EXP_DST_VAL); + *dst_val = 0ul; + + // now clear dst_addr for the main test + + derr = dragon_gpu_memset(&gpuh, dst_addr, NULL_BYTE, size); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + // get ipc handle to be used by child + + dragonIPCHandle_t ipc_handle; + + derr = dragon_gpu_get_ipc_handle(&gpuh, dst_addr, &ipc_handle); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + if (auto pid = fork()) { // parent + // wait for flag to be set by child + + while (*dst_val == NULL_VAL) { + derr = dragon_gpu_copy(&gpuh, (void *) dst_val, dst_addr, size, DRAGON_GPU_D2H); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + } + assert(*dst_val == EXP_DST_VAL); + + // let the child detach from dst_addr before we clean up + int wstatus; + wait(&wstatus); + + derr = dragon_gpu_free_ipc_handle(&gpuh, &ipc_handle); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_mem_free(&gpuh, dst_addr); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_cleanup(&gpuh); + check_result(&gpuh, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + } else { // child + // attach to dst_addr and write flag for parent + + dragonGPUHandle_t gpuh_child; + + derr = dragon_gpu_setup(gpu_backend_type, &gpuh_child); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_attach(&gpuh_child, &ipc_handle, &dst_addr); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + auto src_addr = (void *)nullptr; + + derr = dragon_gpu_mem_alloc(&gpuh_child, &src_addr, size); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_memset(&gpuh_child, src_addr, EXP_DST_BYTE, size); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_copy(&gpuh_child, dst_addr, src_addr, size, DRAGON_GPU_D2D); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_detach(&gpuh_child, dst_addr); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_mem_free(&gpuh_child, src_addr); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + derr = dragon_gpu_cleanup(&gpuh_child); + check_result(&gpuh_child, derr, DRAGON_SUCCESS, ntests_passed, ntests_attempted, __FILE__, __LINE__); + + exit(EXIT_SUCCESS); + } + + fprintf(stdout, "%d out of %d tests passsed\n", ntests_passed, ntests_attempted); + fflush(stdout); +} + diff --git a/test/launcher/backend_testing_mocks.py b/test/launcher/backend_testing_mocks.py index 1adfe4e..9d67c1d 100644 --- a/test/launcher/backend_testing_mocks.py +++ 
b/test/launcher/backend_testing_mocks.py @@ -673,9 +673,9 @@ def send_OverlayPingBE(self): self.log.debug(f"send_OverlayPingBE sent {overlay_ping_be=}") def recv_TAUpdateNodes(self): - be_hsta_update_nodes = get_with_blocking(self.be_ta_conn) - assert isinstance(be_hsta_update_nodes, dmsg.TAUpdateNodes), "expected TAUpdateNodes" - self.log.debug(f"recv_TAUpdateNodes got {be_hsta_update_nodes=}") + be_ta_update_nodes = get_with_blocking(self.be_ta_conn) + assert isinstance(be_ta_update_nodes, dmsg.TAUpdateNodes), "expected TAUpdateNodes" + self.log.debug(f"recv_TAUpdateNodes got {be_ta_update_nodes=}") def recv_BEHaltOverlay(self): # M22 be_halt_overlay = get_with_blocking(self.be_ta_conn) diff --git a/test/launcher/frontend_testing_mocks.py b/test/launcher/frontend_testing_mocks.py index 92c2e8d..34c038b 100644 --- a/test/launcher/frontend_testing_mocks.py +++ b/test/launcher/frontend_testing_mocks.py @@ -1,9 +1,11 @@ import os import logging +from typing import Optional from dragon.launcher.frontend import LauncherFrontEnd from dragon.launcher.util import next_tag from dragon.launcher.network_config import NetworkConfig +from dragon.launcher.dragon_multi_fe import main as frontend_main from dragon.infrastructure.process_desc import ProcessDescriptor from dragon.infrastructure.connection import Connection, ConnectionOptions @@ -35,6 +37,11 @@ def run_frontend(args_map): fe_server.run_msg_server() +def run_resilient_frontend(args_map): + + frontend_main(args_map=args_map) + + def open_overlay_comms(ch_in_desc: B64, ch_out_desc: B64): '''Attach to Frontend's overlay network channels and open a connection''' @@ -54,8 +61,10 @@ def open_overlay_comms(ch_in_desc: B64, def open_backend_comms(frontend_sdesc: str, - network_config: str): + network_config: str, + net_conf: Optional[dict] = None): '''Attach to frontend channel and create channels for backend''' + log = logging.getLogger('open_backend_comms') be_mpool = None try: conn_options = ConnectionOptions(min_block_size=2 ** 16) @@ -68,25 +77,33 @@ def open_backend_comms(frontend_sdesc: str, be_ch_out = Channel.attach(B64.from_str(frontend_sdesc).decode(), mem_pool=be_mpool) - net = NetworkConfig.from_file(network_config) - net_conf = net.get_network_config() + if net_conf is None: + net = NetworkConfig.from_file(network_config) + net_conf = net.get_network_config() be_nodes = {} - for node in net_conf.values(): - be_cuid = dfacts.be_fe_cuid_from_hostid(node.host_id) - - # Add this to the object as it lets teardown know it needs to clean this up - be_ch_in = Channel(be_mpool, be_cuid) - - overlay_inout = Connection(inbound_initializer=be_ch_in, - outbound_initializer=be_ch_out, - options=conn_options, - policy=conn_policy) - overlay_inout.ghost = True - be_nodes[node.host_id] = {'conn': overlay_inout, - 'ch_in': be_ch_in, - 'hostname': node.name, - 'ip_addrs': node.ip_addrs} + log.info(f'net_conf in open_backend_comms: {net_conf}') + for idx, node in net_conf.items(): + + if node.state == NodeDescriptor.State.ACTIVE: + be_cuid = dfacts.be_fe_cuid_from_hostid(node.host_id) + + # Add this to the object as it lets teardown know it needs to clean this up + be_ch_in = Channel(be_mpool, be_cuid) + + overlay_inout = Connection(inbound_initializer=be_ch_in, + outbound_initializer=be_ch_out, + options=conn_options, + policy=conn_policy) + overlay_inout.ghost = True + be_nodes[node.host_id] = {'conn': overlay_inout, + 'ch_in': be_ch_in, + 'hostname': node.name, + 'ip_addrs': node.ip_addrs, + 'state': node.state, + 'node_index': idx, + 'is_primary': 
node.is_primary} + log.debug(f'constructed backend node: {be_nodes[node.host_id]}') except (ChannelError, DragonPoolError, DragonMemoryError) as init_err: # Try to clean up the pool @@ -110,18 +127,20 @@ def send_beisup(nodes): def recv_fenodeidx(nodes): '''recv FENoe4deIdxBE and finish filling out node dictionary''' log = logging.getLogger('recv_fe_nodeidx') - for node in nodes.values(): - fe_node_idx_msg = dmsg.parse(node['conn'].recv()) - assert isinstance(fe_node_idx_msg, dmsg.FENodeIdxBE), 'la_be node_index from fe expected' - - log.info(f'got FENodeIdxBE for index {fe_node_idx_msg.node_index}') - node['node_index'] = fe_node_idx_msg.node_index - if node['node_index'] < 0: - raise RuntimeError("frontend giving bad node indices") - node['is_primary'] = node['node_index'] == 0 - if node['is_primary']: - primary_conn = node['conn'] - log.info(f'constructed be node: {node}') + host_ids = [key for key in nodes.keys()] + for idx, _ in enumerate(host_ids): + if nodes[host_ids[idx]]['state'] == NodeDescriptor.State.ACTIVE: + fe_node_idx_msg = dmsg.parse(nodes[host_ids[idx]]['conn'].recv()) + assert isinstance(fe_node_idx_msg, dmsg.FENodeIdxBE), 'la_be node_index from fe expected' + + log.info(f'got FENodeIdxBE for index {fe_node_idx_msg.node_index}') + nodes[host_ids[idx]]['node_index'] = fe_node_idx_msg.node_index + if nodes[host_ids[idx]]['node_index'] < 0: + raise RuntimeError("frontend giving bad node indices") + nodes[host_ids[idx]]['is_primary'] = nodes[host_ids[idx]]['node_index'] == 0 + if nodes[host_ids[idx]]['is_primary']: + primary_conn = nodes[host_ids[idx]]['conn'] + log.info(f'constructed be node: {nodes[host_ids[idx]]}') return primary_conn @@ -169,8 +188,8 @@ def send_taup(nodes): node['conn'].send(ta_up.serialize()) -def send_abnormal_term(conn): - abnorm = dmsg.AbnormalTermination(tag=next_tag()) +def send_abnormal_term(conn, host_id=0): + abnorm = dmsg.AbnormalTermination(tag=next_tag(), host_id=host_id) conn.send(abnorm.serialize()) @@ -212,43 +231,43 @@ def handle_gsprocesscreate_error(primary_conn): primary_conn.send(response.serialize()) -def stand_up_backend(mock_overlay, mock_launch, network_config): +def stand_up_backend(mock_overlay, mock_launch, network_config, net_conf=None): log = logging.getLogger('mock_backend_standup') # Get the mock's input args to the - while mock_overlay.call_args is None: + while len(mock_overlay.call_args_list) == 0: pass - overlay_args = mock_overlay.call_args.kwargs - + overlay_args = mock_overlay.call_args_list.pop().kwargs overlay = {} # Connect to overlay comms to talk to fronteend overlay['ta_ch_in'], overlay['ta_ch_out'], overlay['fe_ta_conn'] = open_overlay_comms(overlay_args['ch_in_sdesc'], - overlay_args['ch_out_sdesc']) + overlay_args['ch_out_sdesc']) # Let frontend know the overlay is "up" overlay['fe_ta_conn'].send(dmsg.OverlayPingLA(next_tag()).serialize()) # Grab the frontend channel descriptor for the launched backend and # send it mine - while mock_launch.call_args is None: + while len(mock_launch.call_args_list) == 0: pass - launch_be_args = mock_launch.call_args.kwargs + launch_be_args = mock_launch.call_args_list.pop().kwargs log.info(f'got be args: {launch_be_args}') # Connect to backend comms for frontend-to-backend and back comms overlay['be_mpool'], overlay['be_ch_out'], overlay['be_ch_in'], overlay['be_nodes'], overlay['overlay_inout'] = open_backend_comms(launch_be_args['frontend_sdesc'], - network_config) + network_config, + net_conf=net_conf) log.info('got backend up') return overlay -def 
handle_bringup(mock_overlay, mock_launch, network_config): +def handle_bringup(mock_overlay, mock_launch, network_config, net_conf=None): log = logging.getLogger('mock_fulL_bringup') - overlay = stand_up_backend(mock_overlay, mock_launch, network_config) + overlay = stand_up_backend(mock_overlay, mock_launch, network_config, net_conf=net_conf) # Send BEIsUp send_beisup(overlay['be_nodes']) @@ -281,7 +300,12 @@ def handle_overlay_teardown(overlay_conn): '''complete teardown of frontend overlay process''' halt_on = dmsg.parse(overlay_conn.recv()) assert isinstance(halt_on, dmsg.LAHaltOverlay) - overlay_conn.send(dmsg.OverlayHalted(tag=next_tag()).serialize()) + + # This may fail depending on the testing infrastructure + try: + overlay_conn.send(dmsg.OverlayHalted(tag=next_tag()).serialize()) + except ChannelError: + pass def handle_teardown(nodes, primary_conn, overlay_conn, diff --git a/test/launcher/slurm_big.yaml b/test/launcher/slurm_big.yaml new file mode 100644 index 0000000..db22550 --- /dev/null +++ b/test/launcher/slurm_big.yaml @@ -0,0 +1,288 @@ +'0': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 9555388855352840172 + host_name: pinoak0019 + ip_addrs: + - 10.150.0.84:6565 + - 10.150.0.87:6565 + - 10.150.0.85:6565 + - 10.150.0.86:6565 + is_primary: true + name: pinoak0019 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'1': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 12533491927761989908 + host_name: pinoak0014 + ip_addrs: + - 10.150.0.66:6565 + - 10.150.0.64:6565 + - 10.150.0.65:6565 + - 10.150.0.52:6565 + is_primary: false + name: pinoak0014 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'10': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 12810521104317446107 + host_name: pinoak0023 + ip_addrs: + - 10.150.0.98:6565 + - 10.150.0.97:6565 + - 10.150.0.99:6565 + - 10.150.0.100:6565 + is_primary: false + name: pinoak0023 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'11': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 11791745478168178748 + host_name: pinoak0024 + ip_addrs: + - 10.150.0.108:6565 + - 10.150.0.109:6565 + - 10.150.0.105:6565 + - 10.150.0.106:6565 + is_primary: false + name: pinoak0024 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'12': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 11887869611103826773 + host_name: pinoak0025 + ip_addrs: + - 10.150.0.103:6565 + - 10.150.0.107:6565 + - 10.150.0.110:6565 + - 10.150.0.104:6565 + is_primary: false + name: pinoak0025 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'13': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 11429804208641850389 + host_name: pinoak0026 + ip_addrs: + - 10.150.0.112:6565 + - 10.150.0.113:6565 + - 10.150.0.115:6565 + - 10.150.0.114:6565 + is_primary: false + name: pinoak0026 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'14': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 12138584579173682678 + host_name: pinoak0027 + ip_addrs: + - 10.150.0.116:6565 + - 10.150.0.117:6565 + - 10.150.0.118:6565 + - 10.150.0.119:6565 + is_primary: false + name: pinoak0027 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'15': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 12476220481471440359 + host_name: pinoak0028 + ip_addrs: + - 10.150.0.111:6565 + - 10.150.0.120:6565 + - 
10.150.0.121:6565 + - 10.150.0.130:6565 + is_primary: false + name: pinoak0028 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'2': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 12401196674856057593 + host_name: pinoak0015 + ip_addrs: + - 10.150.0.69:6565 + - 10.150.0.70:6565 + - 10.150.0.72:6565 + - 10.150.0.71:6565 + is_primary: false + name: pinoak0015 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'3': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 11113062785457620995 + host_name: pinoak0016 + ip_addrs: + - 10.150.0.73:6565 + - 10.150.0.74:6565 + - 10.150.0.75:6565 + - 10.150.0.76:6565 + is_primary: false + name: pinoak0016 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'4': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 13634311004798939788 + host_name: pinoak0017 + ip_addrs: + - 10.150.0.77:6565 + - 10.150.0.67:6565 + - 10.150.0.78:6565 + - 10.150.0.68:6565 + is_primary: false + name: pinoak0017 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'5': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 10312278666553090270 + host_name: pinoak0018 + ip_addrs: + - 10.150.0.80:6565 + - 10.150.0.82:6565 + - 10.150.0.83:6565 + - 10.150.0.81:6565 + is_primary: false + name: pinoak0018 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'6': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 12224412918854258471 + host_name: pinoak0013 + ip_addrs: + - 10.150.0.61:6565 + - 10.150.0.57:6565 + - 10.150.0.62:6565 + - 10.150.0.63:6565 + is_primary: false + name: pinoak0013 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'7': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 10158260064294485126 + host_name: pinoak0020 + ip_addrs: + - 10.150.0.90:6565 + - 10.150.0.89:6565 + - 10.150.0.79:6565 + - 10.150.0.88:6565 + is_primary: false + name: pinoak0020 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'8': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 10007355517454728825 + host_name: pinoak0021 + ip_addrs: + - 10.150.0.91:6565 + - 10.150.0.92:6565 + - 10.150.0.93:6565 + - 10.150.0.94:6565 + is_primary: false + name: pinoak0021 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 +'9': + accelerators: null + cpu_devices: null + h_uid: null + host_id: 12583076385421469932 + host_name: pinoak0022 + ip_addrs: + - 10.150.0.96:6565 + - 10.150.0.95:6565 + - 10.150.0.101:6565 + - 10.150.0.102:6565 + is_primary: false + name: pinoak0022 + num_cpus: 0 + overlay_cd: '' + physical_mem: 0 + shep_cd: '' + state: 4 diff --git a/test/launcher/slurm_primary.yaml b/test/launcher/slurm_primary.yaml index e78182a..4f647b0 100644 --- a/test/launcher/slurm_primary.yaml +++ b/test/launcher/slurm_primary.yaml @@ -5,6 +5,7 @@ - 10.128.0.5:6565 is_primary: false name: nid00004 + host_name: nid00004 num_cpus: 0 physical_mem: 0 shep_cd: '' @@ -16,6 +17,7 @@ - 10.128.0.6:6565 is_primary: true name: nid00005 + host_name: nid00005 num_cpus: 0 physical_mem: 0 shep_cd: '' @@ -27,6 +29,7 @@ - 10.128.0.7:6565 is_primary: false name: nid00006 + host_name: nid00006 num_cpus: 0 physical_mem: 0 shep_cd: '' @@ -38,6 +41,7 @@ - 10.128.0.8:6565 is_primary: false name: nid00007 + host_name: nid00007 num_cpus: 0 physical_mem: 0 shep_cd: '' diff --git a/test/launcher/test_frontend_bringup.py 
b/test/launcher/test_frontend_bringup.py index caa687c..d3c9207 100644 --- a/test/launcher/test_frontend_bringup.py +++ b/test/launcher/test_frontend_bringup.py @@ -171,7 +171,6 @@ def test_clean_exit(self, mock_overlay, mock_launch): # Join on the frontend thread fe_proc.join() - @unittest.skip('HSTA not currently supported in open source.') @catch_thread_exceptions @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') @patch('dragon.launcher.frontend.start_overlay_network') @@ -192,7 +191,7 @@ def test_error_launching_head_process(self, exceptions_caught_in_threads, mock_o la_info = self.do_bringup(mock_overlay, mock_launch) # Check we launched the backend with default transport - self.assertEqual(la_info.transport, TransportAgentOptions.HSTA) + self.assertEqual(la_info.transport, TransportAgentOptions.TCP) # Receive GSProcessCreate handle_gsprocesscreate_error(self.primary_conn) @@ -205,7 +204,7 @@ def test_error_launching_head_process(self, exceptions_caught_in_threads, mock_o assert exceptions_caught_in_threads['Frontend Server']['exception']['type'] == RuntimeError assert str(exceptions_caught_in_threads['Frontend Server']['exception']['value']) == 'Abnormal exit detected' - @unittest.skip('HSTA not currently supported in open source.') + @unittest.skip('Not supported in open source') @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') @patch('dragon.launcher.frontend.start_overlay_network') def test_clean_exit_with_hsta_launch(self, mock_overlay, mock_launch): diff --git a/test/launcher/test_launch_options.py b/test/launcher/test_launch_options.py index be84d03..fd07b6a 100644 --- a/test/launcher/test_launch_options.py +++ b/test/launcher/test_launch_options.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import unittest +import time import logging import threading from os import environ, path @@ -123,6 +124,7 @@ class LaunchOptionsTest(unittest.TestCase): def setUp(self): self.test_dir = path.dirname(path.realpath(__file__)) self.network_config = path.join(self.test_dir, 'slurm_primary.yaml') + self.big_network_config = path.join(self.test_dir, 'slurm_big.yaml') @catch_thread_exceptions @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') @@ -147,6 +149,79 @@ def test_too_many_nodes_requested(self, exceptions_caught_in_threads, mock_overl assert str(exceptions_caught_in_threads['Frontend Server']['exception']['value']) \ == 'Not enough backend nodes allocated to match requested' + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_resilient_launch_no_nodes_requested(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + """What happens when a user requests resilient launch but doesn't specify # nodes""" + + args_map = get_args_map(self.network_config, + arg1=['--resilient']) + + # get startup going in another thread. 
Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + fe_proc.join() + + assert 'Frontend Server' in exceptions_caught_in_threads # there was an exception in thread 1 + assert exceptions_caught_in_threads['Frontend Server']['exception']['type'] == RuntimeError + assert str(exceptions_caught_in_threads['Frontend Server']['exception']['value']) \ + == "resilient flag requires setting of '--nodes' or '--idle'" + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_resilient_launch_bad_nidle_requested(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + """What happens when a user requests resilient launch but requests too many idle nodes""" + + args_map = get_args_map(self.network_config, + arg1=['--resilient'], + arg2=['--idle', '1000000']) + + # get startup going in another thread. Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + fe_proc.join() + + assert 'Frontend Server' in exceptions_caught_in_threads # there was an exception in thread 1 + assert exceptions_caught_in_threads['Frontend Server']['exception']['type'] == RuntimeError + assert "is greater than available" in \ + str(exceptions_caught_in_threads['Frontend Server']['exception']['value']) + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_resilient_launch_bad_nidle_nnodes_requested(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + """What happens when a user requests resilient launch but requests too many idle nodes for nnodes""" + + self.network_config = self.big_network_config + args_map = get_args_map(self.network_config, + arg1=['--resilient'], + arg2=['--idle', '13'], + arg3=['--nodes', '4']) + + # get startup going in another thread.
Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + fe_proc.join() + + assert 'Frontend Server' in exceptions_caught_in_threads # there was an exception in thread 1 + assert exceptions_caught_in_threads['Frontend Server']['exception']['type'] == RuntimeError + assert "is greater than available" in \ + str(exceptions_caught_in_threads['Frontend Server']['exception']['value']) + @patch('sys.stderr', new_callable=StringIO) def test_nonint_nodes_requested(self, mock_stderr): """what happens when users passes a non-int for # nodes""" diff --git a/test/launcher/test_resilient_restart.py b/test/launcher/test_resilient_restart.py new file mode 100644 index 0000000..0b77986 --- /dev/null +++ b/test/launcher/test_resilient_restart.py @@ -0,0 +1,639 @@ +#!/usr/bin/env python3 +import os +import sys +import logging +import unittest +import threading +import random +from unittest.mock import patch +from io import StringIO + +from dragon.launcher.launchargs import get_parser + +from dragon.infrastructure.node_desc import NodeDescriptor +from dragon.launcher.network_config import NetworkConfig +from dragon.channels import ChannelError +from dragon.managed_memory import DragonMemoryError + +from .launcher_testing_utils import catch_thread_exceptions + +from .frontend_testing_mocks import run_resilient_frontend +from .frontend_testing_mocks import handle_teardown +from .frontend_testing_mocks import handle_gsprocesscreate, handle_bringup, stand_up_backend +from .frontend_testing_mocks import send_abnormal_term + + +def get_args_map(network_config, **kwargs): + + parser = get_parser() + arg_list = ['--wlm', 'slurm', + '--network-config', f'{network_config}', + '--network-prefix', '^(eth|hsn)'] + for val in kwargs.values(): + arg_list = arg_list + val + + arg_list.append('hello_world.py') + + args = parser.parse_args(args=arg_list) + if args.basic_label or args.verbose_label: + args.no_label = False + args_map = {key: value for key, value in vars(args).items() if value is not None} + + return args_map + + +class FrontendRestartTest(unittest.TestCase): + + def setUp(self): + self.test_dir = os.path.dirname(os.path.realpath(__file__)) + self.network_config = os.path.join(self.test_dir, 'slurm_primary.yaml') + self.bad_network_config = os.path.join(self.test_dir, 'slurm_bad.yaml') + self.big_network_config = os.path.join(self.test_dir, 'slurm_big.yaml') + + self.be_mpool = None + self.be_ch_out = None + self.be_ch_in = None + self.overlay_inout = None + + self.ls_ch = None + + self.ta_ch_in = None + self.ta_ch_out = None + self.fe_ta_conn = None + + def tearDown(self): + + self.cleanup() + + def cleanup(self): + + try: + self.fe_ta_conn.close() + except (ConnectionError, AttributeError): + pass + + try: + self.be_ch_out.detach() + except (ChannelError, AttributeError): + pass + + try: + for node in self.be_nodes.values(): + node['conn'].close() + node['ch_in'].destroy() + except (AttributeError, ChannelError): + pass + + try: + for node in self.be_nodes.values(): + node['ls_ch'].destroy() + if node['is_primary']: + node['gs_ch'].destroy() + except (AttributeError, ChannelError, KeyError): + pass + + try: + self.ta_ch_out.detach() + except (AttributeError, ChannelError): + pass + + try: + self.ta_ch_in.detach() + except (AttributeError, ChannelError): + pass + + try: + self.be_mpool.destroy() + del self.be_mpool + except (AttributeError, DragonMemoryError): + pass + + def 
do_bringup(self, mock_overlay, mock_launch, net_conf=None): + + overlay, la_info = handle_bringup(mock_overlay, + mock_launch, + self.network_config, + net_conf=net_conf) + self.ta_ch_in = overlay['ta_ch_in'] + self.ta_ch_out = overlay['ta_ch_out'] + self.fe_ta_conn = overlay['fe_ta_conn'] + self.be_mpool = overlay['be_mpool'] + self.be_ch_out = overlay['be_ch_out'] + self.be_ch_in = overlay['be_ch_in'] + self.be_nodes = overlay['be_nodes'] + self.overlay_inout = overlay['overlay_inout'] + self.primary_conn = overlay['primary_conn'] + + return la_info + + def get_backend_up(self, mock_overlay, mock_launch): + + overlay = stand_up_backend(mock_overlay, mock_launch, self.network_config) + self.ta_ch_in = overlay['ta_ch_in'] + self.ta_ch_out = overlay['ta_ch_out'] + self.fe_ta_conn = overlay['fe_ta_conn'] + self.be_mpool = overlay['be_mpool'] + self.be_ch_out = overlay['be_ch_out'] + self.be_ch_in = overlay['be_ch_in'] + self.be_nodes = overlay['be_nodes'] + self.overlay_inout = overlay['overlay_inout'] + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_abnormal_restart_no_promotion(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + '''Test the ability of frontend to restart from an Abnormal Term, excluding the node that sent the signal with no replacement''' + + args_map = get_args_map(self.network_config, + arg1=['--resilient', '--exhaust-resources'], + arg2=['--nodes', '4'], + arg3=['-l', 'dragon_file=DEBUG'], + arg4=['-l', 'actor_file=DEBUG']) + + # get startup going in another thread. Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_resilient_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + + # Get backend up + self.do_bringup(mock_overlay, mock_launch) + + # Receive GSProcessCreate + handle_gsprocesscreate(self.primary_conn) + + # Send an abormal termination rather than proceeding with teardown + dropped_host_id = 0 + dropped_index = 2 + for i, (host_id, node) in enumerate(self.be_nodes.items()): + if i == dropped_index: + dropped_host_id = host_id + send_abnormal_term(node['conn'], host_id=host_id) + + # Necessarily clean up all the backend stuff: + self.cleanup() + + # Construct my own network configuration and set the State manually so + # the backend mocks know how to behave + net = NetworkConfig.from_file(self.network_config) + net_conf = net.get_network_config() + + for node in net_conf.values(): + if node.host_id == dropped_host_id: + node.state = NodeDescriptor.State.DOWN + + # Get backend back up for the resilient launch + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Check that the frontend gave us the expected config + self.assertEqual(len(self.be_nodes), len(net_conf) - 1) + for host_id, node in self.be_nodes.items(): + self.assertNotEqual(host_id, dropped_host_id) + + handle_gsprocesscreate(self.primary_conn) + handle_teardown(self.be_nodes, self.primary_conn, self.fe_ta_conn) + + # Join on the frontend thread + fe_proc.join() + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_abnormal_restart_with_promotion(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + '''Test the ability of frontend to restart from an Abnormal Term, excluding the node that sent the signal with a replacement''' + + nnodes = 4 + 
self.network_config = self.big_network_config + args_map = get_args_map(self.network_config, + arg1=['--resilient'], + arg2=['--nodes', f'{nnodes}'], + arg3=['-l', 'dragon_file=DEBUG'], + arg4=['-l', 'actor_file=DEBUG']) + + # Construct our node list: + net = NetworkConfig.from_file(self.network_config) + net_conf = net.get_network_config() + + active_nodes = 0 + for node in net_conf.values(): + if active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # get startup going in another thread. Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_resilient_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + + # Get backend up + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Receive GSProcessCreate + handle_gsprocesscreate(self.primary_conn) + + # Send an abormal termination rather than proceeding with teardown + dropped_host_id = 0 + dropped_index = 2 + for i, (host_id, node) in enumerate(self.be_nodes.items()): + if i == dropped_index: + dropped_host_id = host_id + send_abnormal_term(node['conn'], host_id=host_id) + + # Necessarily clean up all the backend stuff: + self.cleanup() + + active_nodes = 0 + for node in net_conf.values(): + if node.host_id == dropped_host_id: + node.state = NodeDescriptor.State.DOWN + elif active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # Get backend back up for the resilient launch + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Check that the frontend gave us the expected config + self.assertEqual(len(self.be_nodes), nnodes) + for host_id, node in self.be_nodes.items(): + self.assertNotEqual(host_id, dropped_host_id) + + handle_gsprocesscreate(self.primary_conn) + handle_teardown(self.be_nodes, self.primary_conn, self.fe_ta_conn) + + # Join on the frontend thread + fe_proc.join() + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_abnormal_restart_with_promotion_and_idle_nodes(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + '''Test the ability of frontend to restart from an Abnormal Term, excluding the node that sent the signal with a replacement''' + + nnodes = 4 + idle_nodes = 12 + self.network_config = self.big_network_config + args_map = get_args_map(self.network_config, + arg1=['--resilient'], + arg2=['--nodes', f'{nnodes}'], + arg3=['--idle', f'{idle_nodes}'], + arg4=['-l', 'dragon_file=DEBUG'], + arg5=['-l', 'actor_file=DEBUG']) + + # Construct our node list: + net = NetworkConfig.from_file(self.network_config) + net_conf = net.get_network_config() + + active_nodes = 0 + for node in net_conf.values(): + if active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # get startup going in another thread. 
Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_resilient_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + + # Get backend up + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Receive GSProcessCreate + handle_gsprocesscreate(self.primary_conn) + + # Send an abormal termination rather than proceeding with teardown + dropped_host_id = 0 + dropped_index = 2 + for i, (host_id, node) in enumerate(self.be_nodes.items()): + if i == dropped_index: + dropped_host_id = host_id + send_abnormal_term(node['conn'], host_id=host_id) + + # Necessarily clean up all the backend stuff: + self.cleanup() + + active_nodes = 0 + for node in net_conf.values(): + if node.host_id == dropped_host_id: + node.state = NodeDescriptor.State.DOWN + elif active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # Get backend back up for the resilient launch + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Check that the frontend gave us the expected config + self.assertEqual(len(self.be_nodes), nnodes) + for host_id, node in self.be_nodes.items(): + self.assertNotEqual(host_id, dropped_host_id) + + handle_gsprocesscreate(self.primary_conn) + handle_teardown(self.be_nodes, self.primary_conn, self.fe_ta_conn) + + # Join on the frontend thread + fe_proc.join() + + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_abnormal_restart_kill_global_services(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + '''Test the ability of frontend to restart from an Abnormal Term when downed node is global services''' + + nnodes = 4 + self.network_config = self.big_network_config + args_map = get_args_map(self.network_config, + arg1=['--resilient'], + arg2=['--nodes', f'{nnodes}'], + arg3=['-l', 'dragon_file=DEBUG'], + arg4=['-l', 'actor_file=DEBUG']) + + # Construct our node list: + net = NetworkConfig.from_file(self.network_config) + net_conf = net.get_network_config() + + active_nodes = 0 + for node in net_conf.values(): + if active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # get startup going in another thread. 
Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_resilient_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + + # Get backend up + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Receive GSProcessCreate + handle_gsprocesscreate(self.primary_conn) + + # Send an abormal termination to global services + dropped_host_id = 0 + dropped_index = [int(k) for k, v in net_conf.items() if v.is_primary][0] + for i, (host_id, node) in enumerate(self.be_nodes.items()): + if i == dropped_index: + dropped_host_id = host_id + send_abnormal_term(node['conn'], host_id=host_id) + + # Necessarily clean up all the backend stuff: + self.cleanup() + + # Update our own internal tracking to match what should be in the launcher's + # network configuration + active_nodes = 0 + for node in net_conf.values(): + if node.host_id == dropped_host_id: + node.state = NodeDescriptor.State.DOWN + elif active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # Get backend back up for the resilient launch + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Check that the frontend gave us the expected config + self.assertEqual(len(self.be_nodes), nnodes) + for host_id, node in self.be_nodes.items(): + self.assertNotEqual(host_id, dropped_host_id) + + # Check that some node is selected as primary + self.assertTrue(any([node['is_primary'] for node in self.be_nodes.values()])) + + # Do the rest of bring-up and teardown + handle_gsprocesscreate(self.primary_conn) + handle_teardown(self.be_nodes, self.primary_conn, self.fe_ta_conn) + + # Join on the frontend thread + fe_proc.join() + + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_abnormal_restart_exhaust_resources(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + '''Test the ability of frontend to restart until there are no nodes left to use''' + + nnodes = 4 + self.network_config = self.big_network_config + args_map = get_args_map(self.network_config, + arg1=['--resilient', '--exhaust-resources'], + arg2=['--nodes', f'{nnodes}'], + arg3=['-l', 'dragon_file=DEBUG'], + arg4=['-l', 'actor_file=DEBUG']) + + # Construct our node list: + net = NetworkConfig.from_file(self.network_config) + net_conf = net.get_network_config() + all_nodes = len(net_conf) + + active_nodes = 0 + for node in net_conf.values(): + if active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + current_node_count = nnodes + alive_hosts = [node.host_id for node in net_conf.values()] + + # get startup going in another thread. 
Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_resilient_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + + # Get backend + dropped_index = 2 + dropped_host_id = 0 + + log = logging.getLogger('test') + for index in range(all_nodes): + + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Receive GSProcessCreate + log.info('Creating head proc') + handle_gsprocesscreate(self.primary_conn) + log.info('Head proc created') + + # Check that the frontend gave us the expected config + self.assertEqual(len(self.be_nodes), nnodes) + for host_id, node in self.be_nodes.items(): + self.assertNotEqual(host_id, dropped_host_id) + + # if the last execution, grab stdout to make sure the correct message is printed + if index == all_nodes - 1: + captured_stdout = StringIO() + sys.stdout = captured_stdout + + # Send an abormal termination rather than proceeding with teardown + if dropped_index > nnodes - 1: + dropped_index = nnodes - 1 + for i, (host_id, node) in enumerate(self.be_nodes.items()): + if i == dropped_index: + dropped_host_id = host_id + log.info(f"sending a abnormal signal to {host_id}: {node}") + send_abnormal_term(node['conn'], host_id=host_id) + + # Necessarily clean up all the backend stuff: + log.info('doing cleanup of mocks') + self.cleanup() + + # Update our internal ref on the node states + active_nodes = 0 + log.info('updating net conf') + for node in net_conf.values(): + if node.state != NodeDescriptor.State.DOWN: + if node.host_id == dropped_host_id: + log.info(f"marking {dropped_host_id} down in net_conf") + node.state = NodeDescriptor.State.DOWN + elif active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # Update the node count if we have fewer than originally proposed + if active_nodes < nnodes: + nnodes = active_nodes + + # Join on the frontend thread + fe_proc.join() + + # Reset stdout and check captured output + sys.stdout = sys.__stdout__ + self.assertTrue("There are no more hardware resources available for continued app execution." in captured_stdout.getvalue()) + + @catch_thread_exceptions + @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend') + @patch('dragon.launcher.frontend.start_overlay_network') + def test_abnormal_restart_min_nodes(self, exceptions_caught_in_threads, mock_overlay, mock_launch): + '''Test the ability of frontend to restart until there are not enough nodes left for user requested node count''' + + nnodes = 4 + self.network_config = self.big_network_config + args_map = get_args_map(self.network_config, + arg1=['--resilient'], + arg2=['--nodes', f'{nnodes}'], + arg3=['-l', 'dragon_file=DEBUG'], + arg4=['-l', 'actor_file=DEBUG']) + + # Construct our node list: + net = NetworkConfig.from_file(self.network_config) + net_conf = net.get_network_config() + all_nodes = len(net_conf) + + active_nodes = 0 + for node in net_conf.values(): + if active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + current_node_count = nnodes + alive_hosts = [node.host_id for node in net_conf.values()] + + # get startup going in another thread. 
Note: need to do threads in order to use + # all our mocks + fe_proc = threading.Thread(name='Frontend Server', + target=run_resilient_frontend, + args=(args_map,), + daemon=False) + fe_proc.start() + + # Get backend + dropped_index = 2 + dropped_host_id = 0 + + log = logging.getLogger('test') + for index in range(all_nodes - nnodes + 1): + + self.do_bringup(mock_overlay, mock_launch, net_conf=net_conf) + + # Receive GSProcessCreate + log.info('Creating head proc') + handle_gsprocesscreate(self.primary_conn) + log.info('Head proc created') + + # Check that the frontend gave us the expected config + self.assertEqual(len(self.be_nodes), nnodes) + for host_id, node in self.be_nodes.items(): + self.assertNotEqual(host_id, dropped_host_id) + + # if the last execution, grab stdout to make sure the correct message is printed + if index == all_nodes - nnodes: + captured_stdout = StringIO() + sys.stdout = captured_stdout + + # Send an abormal termination rather than proceeding with teardown + if dropped_index > nnodes - 1: + dropped_index = nnodes - 1 + for i, (host_id, node) in enumerate(self.be_nodes.items()): + if i == dropped_index: + dropped_host_id = host_id + log.info(f"sending a abnormal signal to {host_id}: {node}") + send_abnormal_term(node['conn'], host_id=host_id) + + # Necessarily clean up all the backend stuff: + log.info('doing cleanup of mocks') + self.cleanup() + + # Update our internal ref on the node states + active_nodes = 0 + log.info('updating net conf') + for node in net_conf.values(): + if node.state != NodeDescriptor.State.DOWN: + if node.host_id == dropped_host_id: + log.info(f"marking {dropped_host_id} down in net_conf") + node.state = NodeDescriptor.State.DOWN + elif active_nodes != nnodes: + node.state = NodeDescriptor.State.ACTIVE + active_nodes = active_nodes + 1 + else: + node.state = NodeDescriptor.State.IDLE + + # Update the node count if we have fewer than originally proposed + if active_nodes < nnodes: + nnodes = active_nodes + + # Join on the frontend thread + fe_proc.join() + + # Reset stdout and check captured output + sys.stdout = sys.__stdout__ + self.assertTrue("There are not enough hardware resources available for continued app execution" in captured_stdout.getvalue()) + + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + unittest.main() diff --git a/test/mpbridge/test_pool.py b/test/mpbridge/test_pool.py new file mode 100644 index 0000000..712c0c0 --- /dev/null +++ b/test/mpbridge/test_pool.py @@ -0,0 +1,58 @@ +import unittest + +import dragon +import multiprocessing as mp + +# last val should be (NUM_WORKERS ** INCEPTION_DEPTH - 1) ** 2 +NUM_WORKERS = 2 +INCEPTION_DEPTH = 3 + +def square(x): + return x ** 2 + +def flatten(list_of_lists): + flat = [] + for val in list_of_lists: + flat.extend(val) + return flat + +def square_or_pool(args): + value, depth_val = args + if depth_val > 1: + to_square = [x for x in range(value, value + NUM_WORKERS ** (depth_val-1), NUM_WORKERS ** (depth_val-2))] + depth = [depth_val - 1] * len(to_square) + pool = mp.Pool(NUM_WORKERS) + result = pool.map(square_or_pool, zip(to_square, depth)) + if depth_val > 2: + result = flatten(result) + pool.close() + pool.join() + else: + result = square(value) + + return result + +def setUpModule(): + mp.set_start_method("dragon", force=True) + + +class TestMPBridgePool(unittest.TestCase): + def setUp(self): + self.assertEqual(mp.get_start_method(), "dragon") + + def test_pool_recursion(self): + to_square = [x for x in range(0, NUM_WORKERS ** INCEPTION_DEPTH, 
NUM_WORKERS ** (INCEPTION_DEPTH - 1))] + depth = [INCEPTION_DEPTH] * len(to_square) + + pool = mp.Pool(NUM_WORKERS) + + squared = pool.map(square_or_pool, zip(to_square, depth)) + squared = flatten(squared) + + pool.close() + pool.join() + self.assertEqual(squared, [square(x) for x in range(NUM_WORKERS ** INCEPTION_DEPTH)]) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/multi-node/Makefile b/test/multi-node/Makefile index 085f31e..69268f9 100644 --- a/test/multi-node/Makefile +++ b/test/multi-node/Makefile @@ -26,7 +26,9 @@ TESTS_DRAGON_MULTI_NODE := test_barrier.py \ test_value.py \ test_array.py \ test_process_group.py \ - test_dict.py + test_ddict.py \ + test_distdict.py \ + test_fli.py TESTS_DRAGON_MULTI_NODE_PMI := test_mpi_hello_world.py diff --git a/test/multi-node/test_ddict.py b/test/multi-node/test_ddict.py new file mode 100644 index 0000000..9bbfd6d --- /dev/null +++ b/test/multi-node/test_ddict.py @@ -0,0 +1,373 @@ +#!/usr/bin/env python3 + +import os +import sys +import string +import random +import dragon +import unittest +import multiprocessing as mp + +from dragon.data.distdictionary.dragon_dict import DragonDict +from dragon.globalservices.node import get_list +from dragon.infrastructure import parameters + +def do_client_ops(ddict, key, value): + """Each client will do the operation of adding the sample key,value pair + into the dictionary and again delete the same pair. + + :param ddict: dragon distributed dictionary + :type ddict: dragon dictionary object + :param key: Information about the SET operation to the manager + :type key: Any hashable structure + :param value: Value to be stored inside the manager + :type value: Any structure that can be serialized + """ + ddict[key] = value + del ddict[key] + # Close the client + ddict.close() + +def generate_keys(): + keys = list() + # Generate the list of 100 keys to add to the dictionary + dict_size = 100 + letters = string.ascii_letters + + for _ in range(dict_size): + # each key is 20 characters long and characters can be repeated + key = ''.join(random.choice(letters) for i in range(20)) + keys.append(key) + return keys + +def do_set_ops(_keys, ddict, value_size): + """Each client will generate the value of given size, and perform + the SET operation to all the keys of the dictionary + + :param _keys: list of all keys to be added to the dictionary + :type _keys: list + :param ddict: dragon distributed dictionary + :type ddict: dragon dictionary object + :param value_size: size of the value in bytes added to each dictionary key + :type value_size: int + """ + num_keys = len(_keys) + letters = string.ascii_letters + value = ''.join(random.choice(letters) for i in range(value_size)) + + # Add each key to the dictionary + for key in _keys: + ddict[key] = value + # Close the client + ddict.close() + +def do_del_ops(_keys, ddict): + """Each client will delete the given list of keys in the dictionary + + :param _keys: list of all keys to be deleted from the dictionary + :type _keys: list + :param ddict: dragon distributed dictionary + :type ddict: dragon dictionary object + """ + for key in _keys: + del ddict[key] + # Close the client + ddict.close() + +def do_get_ops(_keys, ddict): + """Each client will retrieve the values for the given list of keys in the dictionary + + :param _keys: list of all keys to be fetched from the dictionary + :type _keys: list + :param ddict: dragon distributed dictionary + :type ddict: dragon dictionary object + """ + for key in _keys: + value = ddict[key] + # Close the client 
+ ddict.close() + + +class TestDragonDictMultiNode(unittest.TestCase): + @classmethod + def setUpClass(self) -> None: + # Create a dragon dictionary on a single node with multiple manager processes + self.managers_per_node = 1 # 1 manager per node + self.num_nodes = len(get_list()) # Collect the total number of nodes + self.total_mem_size = 2*(1024*1024*1024) # 2 GB total size + self.DD = DragonDict(self.managers_per_node, self.num_nodes, self.total_mem_size) + + @classmethod + def tearDownClass(self) -> None: + self.DD.stop() + + def test_dict_params(self): + # Verify the dictionary params + total_nodes = len(get_list()) + dict_nodes = self.DD._dist_dict.nodes + managers_per_node = self.DD._dist_dict.managers_per_node + dict_managers = dict_nodes * managers_per_node + self.assertEqual(dict_nodes, total_nodes, "Number of dictionary nodes is incorrect") + self.assertEqual(managers_per_node, 1, "Number of managers per node is incorrect") + self.assertEqual(dict_managers, total_nodes, "Number of dictionary managers is incorrect") + + def test_set_and_get_item(self): + key = "Who are you Dragon?" + value = "A Fire Breathing Monster!" + self.DD[key] = value + # Check that the fetched value is correct + self.assertEqual(self.DD[key], value, "Retrieved value from the dict is incorrect") + + def test_setup_and_close_client(self): + # Create a client process and pass the dictionary + client_proc = mp.Process(target=do_client_ops, args=(self.DD, "Dragon", "Dictionary")) + client_proc.start() + client_proc.join() + + def test_set_ops(self): + keys = generate_keys() + # Create client procs that do dictionary operations + num_clients = 8 + value_size = 64 + procs = [] + for i in range(num_clients): + client_proc = mp.Process(target=do_set_ops, args=(keys, self.DD, value_size)) + client_proc.start() + procs.append(client_proc) + + for i in range(len(procs)): + procs[i].join() + procs[i].kill() + + def test_dictionary_length(self): + num_kv_pairs = 100 + # Assign key value pairs to the dictionary + for i in range(0, num_kv_pairs): + key = "Hello" + "." + str(i) + value = "Dragon" + "." + str(i) + self.DD[key] = value + + # Calculate the length of the dictionary + dict_length = len(self.DD) + # Verify that the length of the dictionary equals the number of kv pairs + self.assertEqual(dict_length, num_kv_pairs, "Calculated length of the dictionary is incorrect") + + def test_set_and_del_item(self): + key = "Hello" + value = "Dictionary" + self.DD[key] = value + # Delete operation should be successful + del self.DD[key] + + def test_del_item_with_no_key(self): + # Delete operation should succeed even when the key is not present + # Log the information that the key is not present. + letters = string.ascii_letters + key = ''.join(random.choice(letters) for i in range(10)) + del self.DD[key] + + def test_del_ops(self): + keys = list() + num_kv_pairs = 100 + # Collect the set of keys for the dictionary + for i in range(0, num_kv_pairs): + key = "Hello" + "."
+ str(i) + keys.append(key) + + # Two client processes delete keys in parallel + # This may attempt extra deletes of keys already removed by the other proc + num_clients = 2 + procs = [] + for i in range(num_clients): + client_proc = mp.Process(target=do_del_ops, args=(keys, self.DD)) + client_proc.start() + procs.append(client_proc) + + for i in range(len(procs)): + procs[i].join() + procs[i].kill() + + def test_keys(self): + # Collect the keys from the dictionary + keys = self.DD.keys() + num_keys = len(keys) + dict_length = len(self.DD) + # Verify that the length of the dictionary equals the number of keys + self.assertEqual(dict_length, num_keys, "Calculated length of the dictionary is incorrect") + + def test_get_ops(self): + keys = self.DD.keys() + # Create client procs that do dictionary operations + num_clients = 8 + procs = [] + for i in range(num_clients): + client_proc = mp.Process(target=do_get_ops, args=(keys, self.DD)) + client_proc.start() + procs.append(client_proc) + + for i in range(len(procs)): + procs[i].join() + procs[i].kill() + + def test_existing_key(self): + key = "Testing" + value = "Key" + self.DD[key] = value + key_found = False + # Verify if the key is present in the dictionary + if key in self.DD: + key_found = True + self.assertEqual(key_found, True, "Key was not found in the dictionary") + + def test_not_existing_key(self): + key = "Test" + key_found = False + # Verify if the key is present in the dictionary + if key in self.DD: + key_found = True + self.assertEqual(key_found, False, "Key found in the dictionary") + + def test_pop_item(self): + key = "Pop" + value = "Item" + self.DD[key] = value + # Collect the value by popping the key + pop_value = self.DD.pop(key) + # Verify that the pop operation is correct + self.assertEqual(value, pop_value, "Pop value is not returned correctly") + + def test_existing_value(self): + key = "Testing" + value = "Value" + self.DD[key] = value + value_found = False + # Check whether the value exists in the dictionary + if value in self.DD.values(): + value_found = True + self.assertEqual(value_found, True, "Value was not found in the dictionary") + + def test_not_existing_value(self): + value = "Dummy Value" + value_found = False + # Check whether the value exists in the dictionary + if value in self.DD.values(): + value_found = True + self.assertEqual(value_found, False, "Value found in the dictionary") + + def test_items(self): + items = self.DD.items() + for (key, value) in items: + ret_value = self.DD[key] + # Verify the value retrieved from items() against a direct lookup of the same key + self.assertEqual(value, ret_value, "Value collected from the items is incorrect") + + def test_rename(self): + key = "Hello" + value = "Dictionary" + self.DD[key] = value + # Rename the above key + new_key = "Renamed" + self.DD.rename(key, new_key) + ret_value = self.DD[new_key] + # Verify that the key is properly renamed + self.assertEqual(value, ret_value, "Renaming the key was not successful") + + +class TestDragonDictMultiNodeStress(unittest.TestCase): + @classmethod + def increment_values(self, client_id, ev, ddict, locks): + """Each client will increment the value in all keys of the dictionary. + A corresponding lock will be taken for each key while incrementing the value.
+ + :param client_id: id of the client + :type client_id: int + :param ev: event for the client + :type ev: multiprocessing event object + :param ddict: dragon distributed dictionary + :type ddict: dragon dictionary object + :param locks: dict of multiprocessing locks, one for each key + :type locks: dict + """ + # Add key to the dictionary, and wait for the event + ddict[client_id] = int(0) + ev.wait() + + keys = ddict.keys() + random.shuffle(keys) + for key in keys: + # Acquire the lock for the key and increment the associated value + success = locks[key].acquire(block=True, timeout=None) + assert success == True, "Could not acquire lock" + value = int(ddict[key]) + value += 1 + ddict[key] = int(value) + locks[key].release() + + # Close the client + ddict.close() + + @classmethod + def do_dict_workload(self, num_clients, ddict): + """Test the workload on the dictionary, with each client incrementing the values + in each key of the dictionary. A corresponding lock will be taken for each key + while incrementing the value for synchronization purposes. Each key will have a + multiprocessing lock to be acquired before incrementing the value. + + :param num_clients: total number of clients + :type num_clients: int + :param ddict: dragon distributed dictionary + :type ddict: dragon dictionary object + """ + procs = [] + locks = [] + event = mp.Event() + for i in range(num_clients): + locks.append(mp.Lock()) + + for i in range(num_clients): + client_proc = mp.Process(target=self.increment_values, args=(i, event, ddict, locks,)) + client_proc.start() + procs.append(client_proc) + + wait = True + while wait: + if(len(ddict) == num_clients): + wait = False + + # Wait is done + event.set() + for i in range(len(procs)): + procs[i].join() + + # Clean all the processes + for i in range(len(procs)): + procs[i].kill() + + @unittest.skip('CIRRUS-1820: Hanging') + def test_dict_with_stress_load(self): + managers_per_node = 1 # 1 manager per node + num_nodes = len(get_list()) # Collect the total number of nodes + total_mem_size = 4*(1024*1024*1024) # 4 GB total size + ddict = DragonDict(managers_per_node, num_nodes, total_mem_size) + + # Being conservative, increase to the max cpu_count later + num_clients = max(2, mp.cpu_count() // 4) + self.do_dict_workload(num_clients, ddict) + + # Verify the values are updated properly + # Each value should be equal to num_clients + values = ddict.values() + for value in values: + self.assertEqual(value, num_clients, "Value is not equal to the number of clients in the dictionary") + ddict.stop() + + +if __name__ == "__main__": + mp.set_start_method("dragon") + + # Disable the non-deterministic behavior of the python hashing algorithm + hashseed = os.getenv('PYTHONHASHSEED') + if not hashseed: + os.environ['PYTHONHASHSEED'] = '0' + os.execv(sys.executable, [sys.executable] + sys.argv) + unittest.main() diff --git a/test/multi-node/test_distdict.py b/test/multi-node/test_distdict.py new file mode 100644 index 0000000..1f10452 --- /dev/null +++ b/test/multi-node/test_distdict.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 + +import unittest + +import dragon.infrastructure.messages as dmsg +import dragon.channels as dch +from dragon.utils import b64encode, b64decode +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.node import get_list +import multiprocessing as mp +from dragon.rc import DragonError + +def register_and_detach(d): + d.detach() + +def set_ops(d, client_id): + key1 = 'hello' + str(client_id) + d[key1] =
'world' + str(client_id) + d.detach() + +def get_ops(d, client_id): + key1 = 'hello' + str(client_id) + assert d[key1] == 'world' + str(client_id) + d.detach() + +def del_ops(d, client_id): + key1 = 'hello' + str(client_id) + del d[key1] + d.detach() + +def contains_ops(d, client_id): + key1 = 'hello' + str(client_id) + assert key1 in d + d.detach() + +class TestDDict(unittest.TestCase): + @classmethod + def setUpClass(self) -> None: + # Create a dragon dictionary on a single node with multiple manager processes + self._managers_per_node = 1 # 1 Managers per node + self._num_nodes = len(get_list()) # Collect the total number of nodes + self._total_mem_size = self._num_nodes*(1024*1024*1024) # 1 GB for each node + self._num_clients = self._num_nodes * 2 # 2 clients per node + + def test_local_channel(self): + ch = dch.Channel.make_process_local() + ch.detach() + + def test_infra_message(self): + msg = dmsg.GSHalted(42) + ser = msg.serialize() + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.GSHalted) + newser = 'eJyrVoovSVayUjA21lFQKklMBzItawE+xQWS' + from_str = dmsg.parse(newser) + self.assertIsInstance(from_str, dmsg.GSHalted) + newser = 'eJyrVoovSVayUjA21lFQKklMBzItawE+xQWS\n' + from_str = dmsg.parse(newser) + self.assertIsInstance(from_str, dmsg.GSHalted) + newline = b'\n\n\n\n' + encoded = b64encode(newline) + decoded = b64decode(encoded) + self.assertEqual(newline, decoded) + newline = '\n\n\n\n' + encoded = b64encode(newline.encode('utf-8')) + decoded = b64decode(encoded) + self.assertEqual(newline, decoded.decode('utf-8')) + + def test_capnp_message (self): + msg = dmsg.DDRegisterClient(42, "HelloWorld", "Dragon") + ser = msg.serialize() + + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.DDRegisterClient) + + def test_ddict_client_response_message(self): + msg = dmsg.DDRegisterClientResponse(42, 43, DragonError.SUCCESS, 0, 2, 'this is dragon error info') + ser = msg.serialize() + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.DDRegisterClientResponse) + + def test_bringup_teardown(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + d.destroy() + + def test_detach_client(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + for i in range(self._num_clients): + client_proc = mp.Process(target=register_and_detach, kwargs={"d": d}) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for i in range(self._num_clients): + procs[i].terminate() + + d.destroy() + + def test_set(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for i in range(self._num_clients): + procs[i].terminate() + + for i in range(self._num_clients): + self.assertTrue('hello' + str(i) in d) + + d.destroy() + + def test_get(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + + # put a bunch of key-value pairs to dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for i in range(self._num_clients): + procs[i].terminate() + + procs = [] + # get key-value pairs from dictionary + for i in 
range(self._num_clients): + client_proc = mp.Process(target=get_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for i in range(self._num_clients): + procs[i].terminate() + + d.destroy() + + def test_pop(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + + # put a bunch of key-value pairs to dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for i in range(self._num_clients): + procs[i].terminate() + + procs = [] + # delete key-value pairs from dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=del_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for i in range(self._num_clients): + procs[i].terminate() + + for i in range(self._num_clients): + self.assertFalse('hello' + str(i) in d) + + d.destroy() + + def test_contains_key(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + + # put a bunch of key-value pairs to dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + # test contains key + procs = [] + for i in range(self._num_clients): + client_proc = mp.Process(target=contains_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for i in range(self._num_clients): + procs[i].terminate() + + d.destroy() + + def test_len(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + self.assertEqual(len(d), 0) + procs = [] + + # put a bunch of key-value pairs to dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + self.assertEqual(len(d), self._num_clients) + d.destroy() + + def test_clear(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + + # put a bunch of key-value pairs to dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + self.assertEqual(len(d), self._num_clients) + d.clear() + self.assertEqual(len(d), 0) + d.destroy() + + + @unittest.skip('Not yet implemented') + def test_iter(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + + # put a bunch of key-value pairs to dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in range(self._num_clients): + procs[i].join() + + for key in d: + num = key[5:] + self.assertEqual(d[key], 'world'+num) + d.destroy() + + def test_keys(self): + d = DDict(self._managers_per_node, self._num_nodes, self._total_mem_size) + procs = [] + + # put a bunch of key-value pairs to dictionary + for i in range(self._num_clients): + client_proc = mp.Process(target=set_ops, args=(d, i)) + client_proc.start() + procs.append(client_proc) + + for i in 
range(self._num_clients): + procs[i].join() + + ddict_keys = d.keys() + for key in ddict_keys: + num = key[5:] + self.assertEqual(d[key], 'world'+num) + + d.destroy() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/multi-node/test_fli.py b/test/multi-node/test_fli.py new file mode 100644 index 0000000..6b3d2c0 --- /dev/null +++ b/test/multi-node/test_fli.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 + +import unittest +import os +import dragon +import multiprocessing as mp +from dragon.fli import FLInterface, DragonFLIError, FLIEOT +from dragon.managed_memory import MemoryPool, MemoryAlloc +from dragon.channels import Channel +from dragon.localservices.options import ChannelOptions +from dragon.infrastructure import facts as dfacts +from dragon.infrastructure import parameters +from dragon.utils import B64 +import socket + +class Next: + def __init__(self, initial_value): + self._value = initial_value + + @property + def next(self): + val = self._value + self._value += 1 + return val + +def echo(fli_in, fli_out, main_host): + host = socket.gethostname() + # print(f'echo running on {host}', flush=True) + # print('Made it into echo', flush=True) + recvh = fli_in.recvh() + sendh = fli_out.sendh() + # print("Getting bytes in echo", flush=True) + x, hint = recvh.recv_bytes() + # print("Got bytes in echo", flush=True) + + + try: + # print("Getting EOT in echo", flush=True) + recvh.recv_bytes() + # print(f'ERROR in test: Did not get EOFError. Data was {x} and hint was {hint} in echo', flush=True) + sendh.send_bytes(b'Error: EOF was not raised in echo', hint+1) + except EOFError: + # print("Got EOT in echo", flush=True) + if host == main_host: + sendh.send_bytes(b'The host for echo and host for main were the same and so is not testing cross node communication.', hint) + else: + sendh.send_bytes(x, hint) + recvh.close() + + sendh.close() + # print("Echo exiting", flush=True) + +def nothing(): + # print(f'Doing nothing on node {socket.gethostname()}', flush=True) + pass + +def test_main(fli1, fli2, channel_host, same): + host = socket.gethostname() + # print(f'test_main running on {host}', flush=True) + + # For some reason, on a two node allocation, the first process started + # does not run on a different node. If this test_main is already run on + # a different node, then it will not need the dummy process created. + if same: + dummy = mp.Process(target=nothing, args=()) + dummy.start() + proc = mp.Process(target=echo, args=(fli1, fli2, host)) + proc.start() + # print("And here", flush=True) + + sendh = fli1.sendh() + recvh = fli2.recvh() + + # print('Sending bytes', flush=True) + sendh.send_bytes(b'hello', 42) + sendh.close() + x, hint = recvh.recv_bytes() + + try: + # print("Getting EOT in main", flush=True) + x, hint = recvh.recv_bytes() + # print(f'ERROR in main: Did not get EOFError. Got {x=} and {hint=}', flush=True) + except EOFError: + # print("Got EOT in main", flush=True) + recvh.close() + + proc.join() + if same: + dummy.join() + + sendh = fli1.sendh() + + if (same and host == channel_host) or (not same and host != channel_host): + sendh.send_bytes(x) + sendh.send_bytes(hint.to_bytes(8, byteorder='little')) + else: + msg = f'Required {same=} and {host} == {channel_host} is not {same}.' 
+ sendh.send_bytes(bytes(msg, 'utf-8')) + sendh.send_bytes(b'0') + + sendh.close() + +class FLITest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + mp.set_start_method('dragon') + cls.host_name = socket.gethostname() + # print(f'Created channels on node {cls.host_name}', flush=True) + cls.default_muid = dfacts.default_pool_muid_from_index(parameters.this_process.index) + cls._buffer_pool = MemoryPool.attach(B64.str_to_bytes(parameters.this_process.default_pd)) + + cls.cuids = Next(dfacts.FIRST_CUID + 1000) + cls.main_ch = Channel(cls._buffer_pool, cls.cuids.next) + cls.mgr_ch = Channel(cls._buffer_pool, cls.cuids.next) + cls.strm_chs = [] + + for i in range(5): + cls.strm_chs.append(Channel(cls._buffer_pool, cls.cuids.next)) + + cls.main_ch2 = Channel(cls._buffer_pool, cls.cuids.next) + cls.mgr_ch2 = Channel(cls._buffer_pool, cls.cuids.next) + cls.strm_chs2 = [] + + for i in range(5): + cls.strm_chs2.append(Channel(cls._buffer_pool, cls.cuids.next)) + + cls.fli1 = FLInterface(main_ch=cls.main_ch, manager_ch=cls.mgr_ch, stream_channels=cls.strm_chs) + cls.fli2 = FLInterface(main_ch=cls.main_ch2, manager_ch=cls.mgr_ch2, stream_channels=cls.strm_chs2) + + @classmethod + def tearDownClass(cls): + cls.main_ch.destroy() + for i in range(5): + cls.strm_chs[i].destroy() + + cls.main_ch2.destroy() + for i in range(5): + cls.strm_chs2[i].destroy() + + def test_hints(self): + test_main(self.fli1, self.fli2, self.host_name, True) + recvh = self.fli1.recvh() + x, _ = recvh.recv_bytes() + hint_bytes, _ = recvh.recv_bytes() + hint = int.from_bytes(hint_bytes, byteorder='little') + # The following code should work with assertRaises, but does not. + # Have no idea why. + try: + recvh.recv_bytes() + self.assertTrue(False, "The recv_bytes should have raised EOFError") + except EOFError: + pass + recvh.close() + self.assertEqual(x,b'hello') + self.assertEqual(hint, 42) + # print('Completed test_hints test!', flush=True) + + def test_hints_reverse(self): + # For some reason, on a two node allocation, the first process started + # does not run on a different node. If this test_main is to be run on a + # separate node from where the channels were created, then we must + # start a dummy process first. + dummy = mp.Process(target=nothing, args=()) + dummy.start() + # print(f'HOST_NAME IS {self.host_name}') + main_proc = mp.Process(target=test_main, args=(self.fli1, self.fli2, self.host_name, False)) + main_proc.start() + main_proc.join() + recvh = self.fli1.recvh() + x, _ = recvh.recv_bytes() + hint_bytes, _ = recvh.recv_bytes() + hint = int.from_bytes(hint_bytes, byteorder='little') + # The following code should work with assertRaises, but does not. + # Have no idea why. 
+ try: + recvh.recv_bytes() + self.assertTrue(False, "The recv_bytes should have raised EOFError") + except EOFError: + pass + + recvh.close() + self.assertEqual(x,b'hello') + self.assertEqual(hint, 42) + # print('Completed test_hints_reverse test!', flush=True) + dummy.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/test/multi-node/test_process.py b/test/multi-node/test_process.py index 7b8ecef..7d49efd 100644 --- a/test/multi-node/test_process.py +++ b/test/multi-node/test_process.py @@ -7,11 +7,15 @@ import unittest import time +import socket +import os import dragon import multiprocessing as mp from dragon.globalservices.process import query, multi_join, this_process - +from dragon.native.machine import cpu_count, current, System, Node +from dragon.native.process import Process +from dragon.infrastructure.policy import Policy def inception(nnew: int, q: mp.Queue, ev1: mp.Event, ev2: mp.Event, sem: mp.Semaphore) -> None: @@ -28,6 +32,17 @@ def inception(nnew: int, q: mp.Queue, ev1: mp.Event, ev2: mp.Event, sem: mp.Sema ev2.wait(timeout=None) +def placement_gpu_info(sleep_time, q, vendor=None): + hostname = socket.gethostname() + if vendor is not None: + if vendor == 'Nvidia': + visible_devices=int(os.getenv("CUDA_VISIBLE_DEVICES")) + elif vendor == 'AMD': + visible_devices=int(os.getenv("ROCR_VISIBLE_DEVICES")) + else: + visible_devices=None + # this sleep is important until Process Group holds a history of puids + q.put((hostname, visible_devices,)) class TestProcessMultiNode(unittest.TestCase): def test_inception(self) -> None: @@ -80,7 +95,26 @@ def test_inception(self) -> None: for p in processes: self.assertTrue(p.exitcode == 0) - + + def test_policy(self) -> None: + my_alloc = System() + node_list = my_alloc.nodes + node = Node(node_list[-1]) + q = mp.Queue() + cwd = os.getcwd() + if node.gpu_vendor is None: + args = (5,q,) + policy = Policy(placement=Policy.Placement.HOST_NAME,host_name=node.hostname) + else: + args = (5,q,node.gpu_vendor,) + policy = Policy(placement=Policy.Placement.HOST_NAME, host_name=node.hostname, device=Policy.Device.GPU, gpu_affinity=[node.gpus[-1]]) + #using native process to take template + proc = Process(target=placement_gpu_info, args=args, policy=policy) + proc.start() + hostname, gpu_affinity = q.get() + self.assertEqual(hostname, node.hostname) + if node.gpu_vendor is not None: + self.assertEqual(gpu_affinity, node.gpus[-1]) if __name__ == "__main__": mp.set_start_method("dragon") diff --git a/test/multi-node/test_process_group.py b/test/multi-node/test_process_group.py index 228d93b..b5c4ca0 100644 --- a/test/multi-node/test_process_group.py +++ b/test/multi-node/test_process_group.py @@ -5,18 +5,37 @@ import time import random import signal +import socket import dragon from dragon.native.process_group import ProcessGroup -from dragon.native.process import TemplateProcess, Process -from dragon.native.machine import cpu_count, current +from dragon.native.process import ProcessTemplate, Process +from dragon.native.queue import Queue +from dragon.native.machine import cpu_count, current, System, Node from dragon.globalservices.process import query as process_query, kill, signal as dragon_signal from dragon.globalservices.node import query as node_query, get_list as node_get_list from dragon.infrastructure.process_desc import ProcessOptions -from dragon.infrastructure.policy import Policy - +from dragon.infrastructure.policy import Policy, GS_DEFAULT_POLICY + +def placement_info(q, vendor): + hostname = socket.gethostname() + pid 
= os.getpid() + cpus_allowed_list = -1 + with open(f'/proc/{pid}/status') as f: + for _, line in enumerate(f): + split_line = line.split(':') + if split_line[0] == "Cpus_allowed_list": + cpus_allowed_list = split_line[1].strip('\n').strip('\t') + break + visible_devices=None + if vendor == 'Nvidia': + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + elif vendor == 'AMD': + visible_devices = os.getenv("ROCR_VISIBLE_DEVICES") + + q.put((hostname, cpus_allowed_list, visible_devices,)) class TestProcessGroupMultiNode(unittest.TestCase): @classmethod @@ -29,9 +48,9 @@ def setUpClass(cls): cls.args = ("1000",) cls.cwd = os.getcwd() cls.options = ProcessOptions(make_inf_channels=True) - cls.template = TemplateProcess(cls.cmd, args=cls.args, cwd=cls.cwd) + cls.template = ProcessTemplate(cls.cmd, args=cls.args, cwd=cls.cwd) - @unittest.skip(f"CIRRUS-1163: Will fail until process descriptor contains a h_uid/host_id.") + @unittest.skip("CIRRUS-1163: Will fail until process descriptor contains a h_uid/host_id.") def test_placement_roundrobin(self): pg = ProcessGroup(restart=False) @@ -76,12 +95,12 @@ def test_maintain_stress(self): for i in range(self.nproc): if i % 10 == 0: args = (f"{testtime/self.nproc * (i+1)}",) - t = TemplateProcess("sleep", args=args, cwd=self.cwd) + t = ProcessTemplate("sleep", args=args, cwd=self.cwd) pg.add_process(1, t) else: count += 1 args = ("10000000",) - t = TemplateProcess("sleep", args=args, cwd=self.cwd) + t = ProcessTemplate("sleep", args=args, cwd=self.cwd) pg.add_process(count, t) pg.init() @@ -143,29 +162,252 @@ def test_walltime(self): start = time.monotonic() while not pg.status == "Idle": self.assertFalse(pg.status == "Error") - time.sleep(0.5) stop = time.monotonic() self.assertAlmostEqual(stop - start, wtime, 0) pg.stop() + + def test_hostname_node_restriction(self): + my_alloc = System() + num_procs_per_node = 2 + num_nodes_to_use = int(my_alloc.nnodes()/2) + node_list = my_alloc.nodes + num_procs = num_nodes_to_use*num_procs_per_node + q = Queue() + gpu_vendor = None + args = (q, gpu_vendor) + cwd = os.getcwd() + grp = ProcessGroup(restart=False) + acceptable_hostnames = [] + + # create a process group that runs on a subset of nodes + for node_num in range(num_nodes_to_use): + node_name = Node(node_list[node_num]).hostname + local_policy = Policy(placement=Policy.Placement.HOST_NAME,host_name=node_name) + grp.add_process(nproc=num_procs_per_node, template=ProcessTemplate(target=placement_info, args=args, cwd=cwd, policy=local_policy)) + acceptable_hostnames.append(node_name) + + #init and start my process group + grp.init() + grp.start() - @unittest.skip(f"CIRRUS-1831: Will fail until PG api is fixed. 
The TODO comment should also be addressed.") - def test_node_id(self): - pg = ProcessGroup(self.template, 4) - pg.start() + count = 0 + while count < num_procs: + hostname, _, _ = q.get() + # check that proc is on a node it was meant to land on + self.assertIn(hostname, acceptable_hostnames, msg=f'Got hostname {hostname} which is not in {acceptable_hostnames}') + count += 1 + + # wait for workers to finish and shutdown process group + grp.join() + grp.stop() + + def test_huid_node_restriction(self): + my_alloc = System() + num_procs_per_node = 2 + num_nodes_to_use = int(my_alloc.nnodes()/2) + node_list = my_alloc.nodes + num_procs = num_nodes_to_use*num_procs_per_node + q = Queue() + gpu_vendor = None + args = (q, gpu_vendor) + cwd = os.getcwd() + grp = ProcessGroup(restart=False) + acceptable_huids = [] + # create a process group that runs on a subset of nodes + for index, huid in enumerate(node_list[:num_nodes_to_use]): + node_huid = Node(node_list[index]).h_uid + self.assertEqual(huid, node_huid, f'{huid} is not equal to {node_huid} from Node.h_uid') + local_policy = Policy(placement=Policy.Placement.HOST_ID,host_id=huid) + grp.add_process(nproc=num_procs_per_node, template=ProcessTemplate(target=placement_info, args=args, cwd=cwd, policy=local_policy)) + acceptable_huids.append(huid) + #init and start my process group + grp.init() + grp.start() - nodes = node_get_list() - puids = pg.puids - for puid in puids: - gs_info = process_query(puid) - print(gs_info) - # TODO: Increment counter for each node, assert we've got equal distribution of procs + count = 0 + while count < num_procs: + hostname, _, _ = q.get() + host_id = Node(hostname).h_uid + # check that proc is on a node it was meant to land on + self.assertIn(host_id, acceptable_huids, msg=f'Got hostname {host_id} which is not in {acceptable_huids}') + count += 1 + grp.join() + grp.stop() + + + def test_policy_hierarchy(self): + my_alloc = System() + num_procs_per_node = 2 + num_nodes_to_use = int(my_alloc.nnodes()/2) + node_list = my_alloc.nodes + num_procs = num_nodes_to_use*num_procs_per_node + q = Queue() + gpu_vendor = None + args = (q, gpu_vendor) + cwd = os.getcwd() + global_policy = Policy(placement=Policy.Placement.HOST_NAME,host_name=Node(node_list[num_nodes_to_use]).hostname) + grp = ProcessGroup(restart=False, policy=global_policy) + acceptable_hostnames = [] + + # create a process group that runs on a subset of nodes + for node_num in range(num_nodes_to_use): + node_name = Node(node_list[node_num]).hostname + local_policy = Policy(placement=Policy.Placement.HOST_NAME,host_name=node_name) + grp.add_process(nproc=num_procs_per_node, template=ProcessTemplate(target=placement_info, args=args, cwd=cwd, policy=local_policy)) + acceptable_hostnames.append(node_name) + + #init and start my process group + grp.init() + grp.start() - pg.stop() + count = 0 + while count < num_procs: + hostname, _, _ = q.get() + # check that proc is on a node it was meant to land on + self.assertIn(hostname, acceptable_hostnames, msg=f'Got hostname {hostname} which is not in {acceptable_hostnames}') + count += 1 + + # wait for workers to finish and shutdown process group + grp.join() + grp.stop() + + def test_block_distribution(self): + my_alloc = System() + num_procs = int(cpu_count()/my_alloc.nnodes()) + q = Queue() + gpu_vendor = None + args = (q, gpu_vendor) + cwd = os.getcwd() + global_policy = Policy(distribution=Policy.Distribution.BLOCK) + grp = ProcessGroup(restart=False, policy=global_policy) + grp.add_process(nproc=num_procs, 
template=ProcessTemplate(target=placement_info, args=args, cwd=cwd)) + + #init and start my process group + grp.init() + grp.start() + + count = 0 + while count < num_procs: + hostname, _, _ = q.get() + if count > 0: + # check that proc is on a node it was meant to land on + self.assertEqual(hostname, acceptable_hostname, msg=f'Got hostname {hostname} which is not in {acceptable_hostname}') + else: + acceptable_hostname=hostname + count += 1 + # wait for workers to finish and shutdown process group + grp.join() + grp.stop() + + + def test_roundrobin_distribution(self): + my_alloc = System() + num_procs_per_node = 10 + num_procs = int(num_procs_per_node*my_alloc.nnodes()) + q = Queue() + gpu_vendor = None + args = (q, gpu_vendor) + cwd = os.getcwd() + global_policy = Policy(distribution=Policy.Distribution.ROUNDROBIN) + grp = ProcessGroup(restart=False, policy=global_policy) + grp.add_process(nproc=num_procs, template=ProcessTemplate(target=placement_info, args=args, cwd=cwd)) + + #init and start my process group + grp.init() + grp.start() + + host_num_procs = {} + count = 0 + while count < num_procs: + hostname, _, _ = q.get() + try: + host_num_procs[hostname] += 1 + except KeyError: + host_num_procs[hostname] = 1 + count += 1 + for val in host_num_procs.values(): + self.assertEqual(val, num_procs_per_node) + + # wait for workers to finish and shutdown process group + grp.join() + grp.stop() + + def test_gpu_affinity(self): + #there are a couple spots where we assume if a node has gpus that all do + my_alloc = System() + node_list = my_alloc.nodes + nodes = {} + has_gpus=False + for node_id in node_list: + node = Node(node_id) + nodes[node.hostname] = node + if node.gpus is not None: + has_gpus=True + + if not has_gpus: + print('System does not have GPUs to test gpu affinity with', flush=True) + return + + a_node = list(nodes.values())[0] + num_use_devices = int(a_node.num_gpus/2) + devices_to_use = a_node.gpus[num_use_devices:] + correct_env_var ="" + for i in devices_to_use: + correct_env_var +=str(i)+',' + correct_env_var = correct_env_var.strip(',') + + self.assertIn(a_node.gpu_vendor, ['Nvidia', 'AMD']) + q = Queue() + args = (q, a_node.gpu_vendor) + cwd = os.getcwd() + global_policy = Policy(device=Policy.Device.GPU, gpu_affinity=devices_to_use) + grp = ProcessGroup(restart=False, policy=global_policy) + grp.add_process(nproc=my_alloc.nnodes(), template=ProcessTemplate(target=placement_info, args=args, cwd=cwd)) + + #init and start my process group + grp.init() + grp.start() + count = 0 + while count < my_alloc.nnodes(): + _, _, visible_devices = q.get() + self.assertEqual(visible_devices, correct_env_var) + count += 1 + + # wait for workers to finish and shutdown process group + grp.join() + grp.stop() + + + #@unittest.skip(f"Need to work out how to check cpu affinity") + def test_cpu_affinity(self): + my_alloc = System() + q = Queue() + gpu_vendor = None + args = (q, gpu_vendor) + cwd = os.getcwd() + allowed_cpus = [3, 8, 11, 13] + group_policy = Policy(cpu_affinity=allowed_cpus) + grp = ProcessGroup(restart=False, policy=group_policy) + grp.add_process(nproc=my_alloc.nnodes(), template=ProcessTemplate(target=placement_info, args=args, cwd=cwd)) + + #init and start my process group + grp.init() + grp.start() + count = 0 + while count < my_alloc.nnodes(): + _, cpus_list_allowed, _ = q.get() + self.assertEqual(allowed_cpus, [int(x) for x in cpus_list_allowed.split(',')]) + count += 1 + + # wait for workers to finish and shutdown process group + grp.join() + grp.stop() + if __name__ 
== "__main__": unittest.main() diff --git a/test/native/Makefile b/test/native/Makefile new file mode 100644 index 0000000..d033dc1 --- /dev/null +++ b/test/native/Makefile @@ -0,0 +1,23 @@ +CC ?= gcc +CFLAGS ?= -fPIC -Wall -Ofast -fomit-frame-pointer +#CFLAGS ?= -g -O0 -DDRAGON_DEBUG -Wall +INCLUDE = -I $(DRAGON_INCLUDE_DIR) +LIBS = -L $(DRAGON_LIB_DIR) + +BIN_FILES = flimsgfrom flimsgto + +%.c.o: %.c + $(CC) $(INCLUDE) $(CFLAGS) -c $< -o $@ + +default: build + +build: flimsgfrom flimsgto + +flimsgfrom: flimsgfrom.cpp + g++ $(INCLUDE) -std=c++14 -o flimsgfrom $< $(LIBS) -ldragon -ldl + +flimsgto: flimsgto.cpp + g++ $(INCLUDE) -std=c++14 -o flimsgto $< $(LIBS) -ldragon -ldl + +clean: + rm -rf *.o $(BIN_FILES) core __pycache__ diff --git a/test/native/flimsgfrom.cpp b/test/native/flimsgfrom.cpp new file mode 100644 index 0000000..bde8ca6 --- /dev/null +++ b/test/native/flimsgfrom.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char* argv[]) { + try { + dragonError_t err; + dragonFLIDescr_t fli; + dragonFLISendHandleDescr_t sendh; + dragonFLISerial_t ser_fli; + + if (argc != 2) { + cout << "FAILED: This program expects a serialized fli as its command-line argument." << endl; + return -1; + } + + const char* serFLIb64 = argv[1]; + ser_fli.data = dragon_base64_decode(serFLIb64, &ser_fli.len); + + err = dragon_fli_attach(&ser_fli, NULL, &fli); + if (err != DRAGON_SUCCESS) { + cout << "FAILED to attach to FLI" << endl; + cerr << "FAILED to attach to FLI" << endl; + return -1; + } + + err = dragon_fli_open_send_handle(&fli, &sendh, NULL, NULL); + if (err != DRAGON_SUCCESS) { + cout << "Failed to open send handle" << endl; + cerr << "Failed to open send handle" << endl; + return -1; + } + + DDRegisterClientMsg msg(42, "Hello World", "Dragon is the best"); + + msg.send(&sendh, NULL); + + cout << "OK" << endl; + return 0; + } catch (...) { + cerr << "Exception in code." << endl << flush; + cout << "Exception in code." 
<< endl << flush; + } + + return -1; +} \ No newline at end of file diff --git a/test/native/flimsgfrom.py b/test/native/flimsgfrom.py new file mode 100644 index 0000000..317015c --- /dev/null +++ b/test/native/flimsgfrom.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import unittest +import os +import dragon +import sys +import multiprocessing as mp +from dragon.fli import FLInterface, DragonFLIError, FLIEOT +from dragon.managed_memory import MemoryPool, MemoryAlloc +from dragon.channels import Channel +from dragon.globalservices import channel +from dragon.localservices.options import ChannelOptions +from dragon.native.process import Popen +import dragon.infrastructure.messages as dmsg +import dragon.infrastructure.facts as facts +import dragon.infrastructure.parameters as parameters +from dragon.utils import b64decode, b64encode + +def main(): + try: + ser_fli = sys.argv[1] + decoded_ser_fli = b64decode(ser_fli) + fli = FLInterface.attach(decoded_ser_fli) + msg = dmsg.DDRegisterClient(42, 'Hello World', 'Dragon is the best') + sendh = fli.sendh() + sendh.send_bytes(msg.serialize()) + sendh.close() + print('OK') + except Exception as ex: + print(f'Got Exception in flimsgfrom.py {ex}', flush=True, file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/test/native/flimsgto.cpp b/test/native/flimsgto.cpp new file mode 100644 index 0000000..c23978f --- /dev/null +++ b/test/native/flimsgto.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char* argv[]) { + try { + dragonError_t err; + dragonFLIDescr_t fli; + dragonFLIRecvHandleDescr_t recvh; + dragonFLISerial_t ser_fli; + DragonMsg* msg; + + if (argc != 2) { + cout << "FAILED: This program expects a serialized fli as its command-line argument." << endl; + return -1; + } + + const char* serFLIb64 = argv[1]; + ser_fli.data = dragon_base64_decode(serFLIb64, &ser_fli.len); + + err = dragon_fli_attach(&ser_fli, NULL, &fli); + if (err != DRAGON_SUCCESS) { + cout << "FAILED to attach to FLI" << endl; + cerr << "FAILED to attach to FLI" << endl; + return -1; + } + + err = dragon_fli_open_recv_handle(&fli, &recvh, NULL, NULL); + if (err != DRAGON_SUCCESS) { + cout << "Failed to open recv handle" << endl; + cerr << "Failed to open recv handle" << endl; + return -1; + } + + err = recv_fli_msg(&recvh, &msg, NULL); + if (err != DRAGON_SUCCESS) { + cout << "Failed to recv msg" << endl; + cerr << "Failed to recv msg" << endl; + return -1; + } + + if (msg->tc() != DD_REGISTER_CLIENT) { + cout << "Failed to match typecode." << endl; + cerr << "Failed to match typecode." << endl; + return -1; + } + + DDRegisterClientMsg* rc_msg = (DDRegisterClientMsg*) msg; + + if (strcmp(rc_msg->respFLI(), "Hello World!")) { + cout << "Failed to find expected string in message" << endl; + cerr << "Failed to find expected string in message" << endl; + return -1; + } + + cout << "OK" << endl; + return 0; + } catch (...) { + cerr << "Exception in code." << endl << flush; + cout << "Exception in code." 
<< endl << flush; + } + + return -1; +} \ No newline at end of file diff --git a/test/native/test_ddict.py b/test/native/test_ddict.py new file mode 100644 index 0000000..e06ee29 --- /dev/null +++ b/test/native/test_ddict.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 + +import unittest +import pickle +import sys +import ctypes +import random +import string +import zlib +import json +import time +import os + +import dragon +import dragon.infrastructure.messages as dmsg +import dragon.channels as dch +from dragon.utils import b64encode, b64decode +from dragon.data.ddict.ddict import DDict, DDictError, DDictManagerFull +import multiprocessing as mp +import traceback +from dragon.rc import DragonError + +def fillit(d): + i = 0 + key = "abc" + while True: + d[key] = key + i+=1 + key += "abc"*i + +class TestDDict(unittest.TestCase): + def setUp(self): + pass + + def tearDown(self): + pass + + def test_local_channel(self): + ch = dch.Channel.make_process_local() + ch.detach() + + def test_infra_message(self): + msg = dmsg.GSHalted(42) + ser = msg.serialize() + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.GSHalted) + newser = 'eJyrVoovSVayUjA21lFQKklMBzItawE+xQWS' + from_str = dmsg.parse(newser) + self.assertIsInstance(from_str, dmsg.GSHalted) + newser = 'eJyrVoovSVayUjA21lFQKklMBzItawE+xQWS\n' + from_str = dmsg.parse(newser) + self.assertIsInstance(from_str, dmsg.GSHalted) + newline = b'\n\n\n\n' + encoded = b64encode(newline) + decoded = b64decode(encoded) + self.assertEqual(newline, decoded) + newline = '\n\n\n\n' + encoded = b64encode(newline.encode('utf-8')) + decoded = b64decode(encoded) + self.assertEqual(newline, decoded.decode('utf-8')) + + def test_capnp_message (self): + msg = dmsg.DDRegisterClient(42, "HelloWorld", "MiskaIsAdorable") + ser = msg.serialize() + + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.DDRegisterClient) + + def test_ddict_client_response_message(self): + msg = dmsg.DDRegisterClientResponse(42, 43, DragonError.SUCCESS, 0, 2, 'this is dragon error info') + ser = msg.serialize() + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.DDRegisterClientResponse) + + def test_bringup_teardown(self): + d = DDict(2,1,3000000) + d.destroy() + + def test_detach_client(self): + d = DDict(2,1,3000000) + d.detach() + d.destroy() + + def test_put_and_get(self): + d = DDict(2,1,3000000) + + d['abc'] = 'def' + x = d['abc'] + self.assertEqual(d['abc'], 'def') + + d[123] = '456' + x = d[123] + self.assertEqual(d[123], '456') + + d[(12,34,56)] = [1,2,3,4,5,6] + y = d[(12,34,56)] + y1 = d[(12,34,56)] # test if the key-value can be requested twice or more + y2 = d[(12,34,56)] + self.assertEqual(y, [1,2,3,4,5,6]) + self.assertEqual(y1, [1,2,3,4,5,6]) + self.assertEqual(y2, [1,2,3,4,5,6]) + self.assertEqual(d[(12,34,56)], [1,2,3,4,5,6]) + + try: + y = d['hello'] + raise AttributeError('Expected KeyError not raised') + except KeyError: + pass + + d.destroy() + + def test_pop(self): + d = DDict(2,1,3000000) + d['abc'] = 'def' + x = d.pop('abc') + self.assertEqual(x, 'def') + self.assertRaises(KeyError, d.pop, 'abc') + + d[123] = 456 + del d[123] + self.assertRaises(KeyError, d.pop, 123) + + d[(12,34,56)] = [1,2,3,4,5,6] + x = d.pop((12,34,56)) + self.assertEqual(x, [1,2,3,4,5,6]) + self.assertRaises(KeyError, d.pop, (12,34,56)) + + d.destroy() + + def test_contains_key(self): + d = DDict(1,1,3000000) + d['abc'] = 'def' + self.assertTrue('abc' in d) # test existence of the added key + self.assertFalse(123 in d) # test existence if the key is never 
added + d[123] = 456 + self.assertTrue(123 in d) + d.pop(123) + self.assertFalse(123 in d) # test existence of a poped key + d.pop('abc') + self.assertFalse('abc' in d) # test existence of a poped key + + # test tuple key and value + d[(1,2,3,4,5)] = [6,7,8,9,10] + self.assertTrue((1,2,3,4,5) in d) + del d[(1,2,3,4,5)] + self.assertFalse((1,2,3,4,5) in d) + + d.destroy() + + def test_len(self): + d = DDict(2,1,3000000) + self.assertEqual(len(d), 0) + d['abc'] = 'def' + self.assertEqual(len(d), 1) + d[123] = 456 + self.assertEqual(len(d), 2) + d[(1,2,3,4,5)] = [6,7,8,9,10] + self.assertEqual(len(d), 3) + d.pop('abc') + self.assertEqual(len(d), 2) + d.pop(123) + self.assertEqual(len(d), 1) + d.pop((1,2,3,4,5)) + self.assertEqual(len(d), 0) + d.destroy() + + def test_clear(self): + d = DDict(2,1,3000000) + d['abc'] = 'def' + d[123] = 456 + d[(1,2,3,4,5)] = [6,7,8,9,10] + self.assertEqual(len(d), 3) + d.clear() + self.assertEqual(len(d), 0) + d.clear() # test clearing an empty dictionary + self.assertEqual(len(d), 0) + d['hello'] = 'world' + d.clear() + self.assertEqual(len(d), 0) + d.destroy() + + @unittest.skip("Not yet implemented.") + def test_iter(self): + try: + d = DDict(2,1,3000000) + k = ['abc', 98765, 'hello', (1,2,3,4,5)] + v = ['def', 200, 'world', ['a',1,3,5,'b']] + for i, key in enumerate(k): + d[key] = v[i] + + for i in d: + if i == "abc": + self.assertEqual(d[i], 'def') + elif i == 98765: + self.assertEqual(d[i], 200) + elif i == 'hello': + self.assertEqual(d[i], 'world') + elif i == (1,2,3,4,5): + self.assertEqual(d[i], ['a',1,3,5,'b']) + else: + raise RuntimeError(f'Get the key which is not added by client: key={i}') + + iter_d = iter(d) + ddict_keys = [] + while True: + try: + ddict_keys.append(next(iter_d)) + except StopIteration: + del iter_d + break + for key in k: + self.assertTrue(key in ddict_keys) + + d.destroy() + except Exception as e: + tb = traceback.format_exc() + raise Exception(f'Exception caught {e}\n Traceback: {tb}') + + def test_keys(self): + d = DDict(2, 1, 3000000) + k = ['abc', 98765, 'hello', (1,2,3,4,5)] + v = ['def', 200, 'world', ['a',1,3,5,'b']] + for i, key in enumerate(k): + d[key] = v[i] + ddict_keys = d.keys() + for key in k: + self.assertTrue(key in ddict_keys) + d.destroy() + + def test_fill(self): + d = DDict(1, 1, 900000) + self.assertRaises(DDictManagerFull, fillit, d) + d.destroy() + +if __name__ == "__main__": + mp.set_start_method('dragon') + unittest.main() diff --git a/test/native/test_msgs.py b/test/native/test_msgs.py new file mode 100644 index 0000000..fceacbe --- /dev/null +++ b/test/native/test_msgs.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +import unittest +import os +import dragon +import sys +import multiprocessing as mp +from dragon.fli import FLInterface, DragonFLIError, FLIEOT +from dragon.managed_memory import MemoryPool, MemoryAlloc +from dragon.channels import Channel +from dragon.globalservices import channel +from dragon.localservices.options import ChannelOptions +from dragon.native.process import Popen +import dragon.infrastructure.messages as dmsg +import dragon.infrastructure.facts as facts +import dragon.infrastructure.parameters as parameters +import dragon.utils as du + +class FLISendRecvTest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + mp.set_start_method('dragon') + + @classmethod + def tearDownClass(cls): + pass + + def setUp(self): + self._default_muid = facts.default_pool_muid_from_index(parameters.this_process.index) + + cdesc = channel.create(self._default_muid) + self.main_ch = 
Channel.attach(cdesc.sdesc) + + cdesc = channel.create(self._default_muid) + self.manager_ch = Channel.attach(cdesc.sdesc) + + self.stream_chs = [] + for i in range(5): + cdesc = channel.create(self._default_muid) + strm_ch = Channel.attach(cdesc.sdesc) + self.stream_chs.append(strm_ch) + + self.fli = FLInterface(main_ch=self.main_ch, manager_ch=self.manager_ch, stream_channels=self.stream_chs) + + def tearDown(self): + self.fli.destroy() + for i in range(5): + channel.destroy(self.stream_chs[i].cuid) + + + def test_receive_from_python(self): + fli_ser = self.fli.serialize() + b64fli_ser = du.b64encode(fli_ser) + path = os.path.abspath('flimsgfrom') + proc = Popen(executable=sys.executable, args=['flimsgfrom.py', b64fli_ser], stdout=Popen.PIPE) + recvh = self.fli.recvh() + ser_msg, arg = recvh.recv_bytes() + msg = dmsg.parse(ser_msg) + self.assertEqual(msg.tc, dmsg.MessageTypes.DD_REGISTER_CLIENT) + self.assertEqual(msg.respFLI, "Hello World") + status = proc.stdout.recv().strip() + self.assertEqual(status, 'OK') + + + def test_receive_from_cpp(self): + fli_ser = self.fli.serialize() + b64fli_ser = du.b64encode(fli_ser) + path = os.path.abspath('flimsgfrom') + proc = Popen(executable=path, args=[b64fli_ser], stdout=Popen.PIPE) + recvh = self.fli.recvh() + ser_msg, arg = recvh.recv_bytes() + msg = dmsg.parse(ser_msg) + self.assertEqual(msg.tc, dmsg.MessageTypes.DD_REGISTER_CLIENT) + self.assertEqual(msg.respFLI, "Hello World") + self.assertEqual(msg.bufferedRespFLI, "Dragon is the best") + status = proc.stdout.recv().strip() + self.assertEqual(status, 'OK') + + + def test_send_to_cpp(self): + fli_ser = self.fli.serialize() + b64fli_ser = du.b64encode(fli_ser) + path = os.path.abspath('flimsgto') + proc = Popen(executable=path, args=[b64fli_ser], stdout=Popen.PIPE) + sendh = self.fli.sendh() + msg = dmsg.DDRegisterClient(5, "Hello World!", "Dragon is the best") + sendh.send_bytes(msg.serialize()) + sendh.close() + status = proc.stdout.recv().strip() + self.assertEqual(status, 'OK') + + def test_set_kv(self): + du.set_local_kv("Hello", "World") + value = du.get_local_kv("Hello") + self.assertEqual(value, "World") + du.set_local_kv("Hello", "") + self.assertRaises(KeyError, du.get_local_kv, 'Hello') + self.assertRaises(KeyError, du.get_local_kv, 'NoKey') + du.set_local_kv("Dragon", "") + self.assertRaises(KeyError, du.get_local_kv, 'Dragon') + + def test_get_channel(self): + ch = Channel.make_process_local() + ch.destroy() + +if __name__ == '__main__': + unittest.main() diff --git a/test/native/test_process.py b/test/native/test_process.py index 3f9d735..e57d6ea 100644 --- a/test/native/test_process.py +++ b/test/native/test_process.py @@ -5,7 +5,7 @@ import dragon from dragon.native.queue import Queue -from dragon.native.process import Process, current, TemplateProcess +from dragon.native.process import Process, current, ProcessTemplate from dragon.infrastructure.parameters import this_process @@ -110,7 +110,7 @@ def test_templating_basic(self): exe = "sleep" args = ("10000",) - templ = TemplateProcess(exe, args) + templ = ProcessTemplate(exe, args) p = Process.from_template(templ, ident="Banana") # creates a new process @@ -124,7 +124,7 @@ def test_templating_python(self): q = Queue() - templ = TemplateProcess(self.putter, args=(q,)) + templ = ProcessTemplate(self.putter, args=(q,)) func, args, kwargs = templ.get_original_python_parameters() self.assertTrue(callable(func)) self.assertIsInstance(args[0], Queue) diff --git a/test/native/test_process_group.py b/test/native/test_process_group.py 
index c0c4340..203fec9 100644 --- a/test/native/test_process_group.py +++ b/test/native/test_process_group.py @@ -1,15 +1,13 @@ import unittest import time import os -import sys import random import signal -import dragon -from dragon.globalservices.process import ProcessError, query, kill, signal as dragon_signal +from dragon.globalservices.process import query, kill, ProcessError from dragon.infrastructure.process_desc import ProcessOptions -from dragon.native.process import Process, TemplateProcess +from dragon.native.process import Process, ProcessTemplate from dragon.native.event import Event from dragon.native.queue import Queue from dragon.native.process_group import ProcessGroup, DragonProcessGroupError @@ -33,7 +31,7 @@ def setUpClass(cls): cls.args = ("128",) cls.cwd = os.getcwd() cls.options = ProcessOptions(make_inf_channels=True) - cls.template = TemplateProcess(cls.cmd, args=cls.args, cwd=cls.cwd) + cls.template = ProcessTemplate(cls.cmd, args=cls.args, cwd=cls.cwd) def test_init(self): @@ -58,23 +56,48 @@ def test_start_stop(self): pg.add_process(self.nproc, self.template) pg.init() - man_proc = pg._manager._proc - self.assertTrue(man_proc.is_alive) + manager = pg._manager + self.assertTrue(manager.is_alive) self.assertTrue(pg.status == "Idle") pg.start() - self.assertTrue(pg.status == "Maintain") puids = pg.puids processes = [Process(None, ident=puid) for puid in puids] - for p in processes: self.assertTrue(p.is_alive) pg.stop() + manager.join(timeout=None) # will the manager exit after stop ? + self.assertFalse(manager.is_alive) - man_proc.join(timeout=None) # will the manager exit after stop ? + def test_alive_puids(self): + + pg = ProcessGroup() + pg.add_process(self.nproc, self.template) + pg.init() + + man_proc = pg._manager._proc + self.assertTrue(man_proc.is_alive) + self.assertTrue(pg.status == "Idle") + + pg.start() + self.assertTrue(pg.status == "Maintain") + + puids = pg.puids.copy() + processes = [Process(None, ident=puid) for puid in puids] + for p in processes: + self.assertTrue(p.is_alive) + + pg.stop(save_puids=True) + # Confirm the puids have been removed from the active and moved to the inactive + puid_statuses = pg.inactive_puids + for puid, ecode in puid_statuses: + self.assertTrue(puid in puids) + self.assertEqual(ecode, -1*signal.SIGKILL.value) + + man_proc.join() self.assertFalse(man_proc.is_alive) @classmethod @@ -85,7 +108,7 @@ def test_join_from_idle(self): ev = Event() - template = TemplateProcess(self.event_quitter, args=(ev,), cwd=".") + template = ProcessTemplate(self.event_quitter, args=(ev,), cwd=".") pg = ProcessGroup(restart=False) pg.add_process(self.nproc, template) pg.init() @@ -93,7 +116,6 @@ def test_join_from_idle(self): self.assertTrue(pg.status == "Idle") pg.start() - self.assertTrue(pg.status == "Running") puids = pg.puids @@ -114,10 +136,11 @@ def test_join_from_idle(self): self.assertTrue(pg.status == "Running") time.sleep(0.1) # avoid race condition - puids = pg.puids - for puid in puids: - self.assertIsNone(puid) - + # Make sure all the puids have exited with 0 exit codes + exit_states = pg.inactive_puids + for puid, exit_code in exit_states: + self.assertTrue(puid in puids) + self.assertEqual(exit_code, 0) pg.stop() def test_join_from_maintain(self): @@ -125,7 +148,7 @@ def test_join_from_maintain(self): ev = Event() - template = TemplateProcess(self.event_quitter, args=(ev,), cwd=".") + template = ProcessTemplate(self.event_quitter, args=(ev,), cwd=".") pg = ProcessGroup(restart=True) pg.add_process(self.nproc, template) 
pg.init() @@ -138,7 +161,7 @@ def test_join_from_maintain(self): puids = pg.puids processes = [Process(None, ident=puid) for puid in puids] - self.assertRaises(TimeoutError, pg.join, 0) + self.assertRaises(TimeoutError, pg.join, 0, True) self.assertTrue(pg.status == "Running") for p in processes: @@ -150,19 +173,24 @@ def test_join_from_maintain(self): p.join() # test autotransition - state_transitioned=False + state_transitioned = False while not state_transitioned: # have to call pg.status once so it doesn't change between checks status = pg.status if status == "Idle": - state_transitioned=True + state_transitioned = True else: self.assertTrue(status == "Running", f"status supposed to be running, it is {status}") time.sleep(0.1) # keeps the loop from being too hot - puids = pg.puids - for puid in puids: - self.assertIsNone(puid) + # Make sure all the puids have exited with 0 exit codes + exit_states = pg.exit_status + active_puids = pg.puids + self.assertTrue(all(puid == 0 for puid in active_puids)) + for puid, exit_code in exit_states: + self.assertTrue(puid in puids) + self.assertEqual(exit_code, 0) + pg.stop() @unittest.skip("This one is very slow for reasons I don't understand.") @@ -170,7 +198,6 @@ def test_join_with_timeout(self): ev = Event() - template = TemplateProcess(self.event_quitter, args=(ev,), cwd=".") pg = ProcessGroup(restart=True) pg.add_process(3, self.template) pg.init() @@ -246,7 +273,7 @@ def test_shutdown_from_maintain(self): q = Queue() - template = TemplateProcess(self._putters, args=(q,), cwd=".") + template = ProcessTemplate(self._putters, args=(q,), cwd=".") pg = ProcessGroup() pg.add_process(self.nproc, template) pg.init() @@ -262,14 +289,18 @@ def test_shutdown_from_maintain(self): for _ in puids: __ = q.get() # make sure they've all executed code - pg.kill(signal.SIGTERM) + pg.kill(signal.SIGTERM, save_puids=True) while not pg.status == "Idle": self.assertTrue(pg.status == "Running") time.sleep(0.1) - for puid in pg.puids: - self.assertIsNone(puid) + exit_states = pg.exit_status + active_puids = pg.puids + self.assertTrue(all(puid == 0 for puid in active_puids)) + for puid, exit_code in exit_states: + self.assertTrue(puid in puids) + self.assertEqual(exit_code, 0) for puid in puids: # check if really dead gs_info = query(puid) @@ -281,7 +312,7 @@ def test_shutdown_from_join(self): q = Queue() - template = TemplateProcess(self._putters, args=(q,), cwd=".") + template = ProcessTemplate(self._putters, args=(q,), cwd=".") pg = ProcessGroup(restart=False) pg.add_process(self.nproc, template) pg.init() @@ -294,14 +325,19 @@ def test_shutdown_from_join(self): puids = pg.puids - pg.kill(signal.SIGTERM) + pg.kill(signal.SIGTERM, save_puids=True) while not pg.status == "Idle": self.assertTrue(pg.status == "Running") time.sleep(0.1) - for puid in pg.puids: - self.assertIsNone(puid) + # Make sure all the puids have exited with SIGTERM exit codes + exit_states = pg.exit_status + active_puids = pg.puids + self.assertTrue(all(puid == 0 for puid in active_puids)) + for puid, exit_code in exit_states: + self.assertTrue(puid in puids) + self.assertEqual(exit_code, -1 * signal.SIGTERM.value) for puid in puids: # check if really dead gs_info = query(puid) @@ -309,6 +345,43 @@ def test_shutdown_from_join(self): pg.stop() + def test_stop_from_maintain(self): + + pg = ProcessGroup() + pg.add_process(self.nproc, self.template) + pg.init() + + self.assertTrue(pg.status == "Idle") + man_proc = pg._manager._proc + self.assertTrue(man_proc.is_alive) + + pg.start() + + 
self.assertTrue(pg.status == "Maintain") + + puids = pg.puids + + pg.stop(save_puids=True) + man_proc.join(timeout=None) # will the manager exit after stop ? + self.assertFalse(man_proc.is_alive) + + self.assertTrue(pg.status == "Idle") + + # Make sure all the puids have exited with SIGKILL exit codes + exit_states = pg.exit_status + active_puids = pg.puids + self.assertTrue(all(puid == 0 for puid in active_puids)) + for puid, exit_code in exit_states: + self.assertTrue(puid in puids) + self.assertEqual(exit_code, -1 * signal.SIGKILL.value) + + for puid in puids: + gs_info = query(puid) + self.assertTrue(gs_info.state == gs_info.State.DEAD) + + pg.stop() + + def test_kill_from_maintain(self): pg = ProcessGroup() @@ -329,8 +402,13 @@ def test_kill_from_maintain(self): self.assertTrue(pg.status == "Idle") - for puid in pg.puids: - self.assertIsNone(puid) + # Make sure all the puids have exited with SIGKILL exit codes + exit_states = pg.exit_status + active_puids = pg.puids + self.assertTrue(all(puid == 0 for puid in active_puids)) + for puid, exit_code in exit_states: + self.assertTrue(puid in puids) + self.assertEqual(exit_code, -1 * signal.SIGKILL.value) for puid in puids: gs_info = query(puid) @@ -340,7 +418,7 @@ def test_kill_from_maintain(self): def test_kill_from_join(self): - pg = ProcessGroup(restart=False) + pg = ProcessGroup(restart=False, ignore_error_on_exit=True) pg.add_process(self.nproc, self.template) pg.init() @@ -377,7 +455,7 @@ def test_no_error_from_maintain(self): ev = Event() - template = TemplateProcess(self._failer, args=(ev,), cwd=".") + template = ProcessTemplate(self._failer, args=(ev,), cwd=".") pg = ProcessGroup(restart=True) pg.add_process(self.nproc, template) pg.init() @@ -417,7 +495,7 @@ def test_error_and_kill_from_join(self): ev = Event() - template = TemplateProcess(self._failer, args=(ev,), cwd=".") + template = ProcessTemplate(self._failer, args=(ev,), cwd=".") pg = ProcessGroup(restart=False) pg.add_process(self.nproc, template) pg.init() @@ -450,7 +528,7 @@ def test_error_and_stop_from_join(self): ev = Event() - template = TemplateProcess(self._failer, args=(ev,), cwd=".") + template = ProcessTemplate(self._failer, args=(ev,), cwd=".") pg = ProcessGroup(restart=False) pg.add_process(self.nproc, template) pg.init() @@ -499,8 +577,8 @@ def test_ignore_error_during_shutdown(self): for puid in puids: # force kill all processes try: - kill(puid, sig=dragon_signal.SIGKILL) - except Exception as e: + kill(puid, sig=signal.SIGKILL) + except ProcessError: pass while not pg.status == "Idle": @@ -512,15 +590,14 @@ def test_bad_transitions(self): pg = ProcessGroup() pg.add_process(self.nproc, self.template) - pg.init() + pg.init() self.assertTrue(pg.status == "Idle") self.assertRaises(DragonProcessGroupError, pg.kill) pg.start() self.assertTrue(pg.status == "Maintain") self.assertRaises(DragonProcessGroupError, pg.start) - self.assertRaises(TimeoutError, pg.join, 0) self.assertTrue(pg.status == "Running") self.assertRaises(DragonProcessGroupError, pg.start) @@ -534,18 +611,80 @@ def test_bad_transitions(self): self.assertRaises(DragonProcessGroupError, pg.kill) self.assertRaises(DragonProcessGroupError, pg.kill, signal.SIGTERM) + def test_inactive_puid_max_size(self): + # Kill enough processes that the initial inactive_puid array size limit is hit and recover + # from it gracefully + + # set a sleep time that is long enough for the kill loop to finish + # before all the procs exit + template = ProcessTemplate("sleep", args=(128,), cwd=".") + + nworkers = 4 + pg = 
ProcessGroup(ignore_error_on_exit=True) # We're going to SIGKILL or SIGTERM everything. Don't raise an exception on it. + pg.add_process(nworkers, template) + pg.init() + man_proc = pg._manager._proc + self._update_interval_sec = 0.1 + + pg.start() + + killed_puids = [] + + while len(killed_puids) < 2 * nworkers: + puids = pg.puids + try: + puid = puids[random.randint(0, len(puids) - 1)] + if puid != 0: + kill(puid, sig=signal.SIGKILL) + killed_puids.append(puid) + else: + continue + except ProcessError: # maybe it disappeared already + pass + + self.assertTrue(man_proc.is_alive) + self.assertTrue(pg.status != "Error") + + time.sleep(0.2) + + # Check that all the killed puids appear in inactive puids with correct exit codes + inactive_puids = pg.inactive_puids + for puid, _ in inactive_puids: + self.assertTrue(puid in killed_puids) + idx = [idx for idx, kpuid in enumerate(killed_puids) if puid == kpuid] + self.assertTrue(len(idx) == 1) + killed_puids.pop(idx[0]) + + self.assertTrue(len(killed_puids) == 0) + + puids = pg.puids + + pg.kill(signal=signal.SIGTERM) + + while not pg.status == "Idle": + self.assertTrue(pg.status != "Error") + + self.assertTrue(man_proc.is_alive) + + for puid in puids: + gs_info = query(puid) + self.assertTrue(gs_info.state == gs_info.State.DEAD) + self.assertTrue(puid not in killed_puids) + pg.stop() + + man_proc.join() - @unittest.skip("Pending resolution on CIRRUS-1455") def test_maintain_stress(self): # we will keep killing processes that keep exiting. # the class has to maintain them and GS has to handle all of that. - testtime = 3 # sec - template = TemplateProcess("sleep", args=(f"{round(testtime/3)}",), cwd=".") + # set a sleep time that is long enough for the kill loop to finish + # before all the procs exit + template = ProcessTemplate("sleep", args=(f"{round(testtime)}",), cwd=".") - pg = ProcessGroup() + pg = ProcessGroup(ignore_error_on_exit=True) # We're going to SIGKILL or SIGTERM everything. Don't raise an exception on it. pg.add_process(64, template) pg.init() man_proc = pg._manager._proc @@ -555,13 +694,16 @@ def test_maintain_stress(self): beg = time.monotonic() - while time.monotonic() - beg < testtime: + while (time.monotonic() - beg) < testtime: puids = pg.puids try: - puid = puids[random.randint(0, len(puids))] - kill(puid, sig=dragon_signal.SIGKILL) - except Exception: # maybe it disappeared already + puid = puids[random.randint(0, len(puids) - 1)] + if puid != 0: + kill(puid, sig=signal.SIGKILL) + else: + continue + except ProcessError: # maybe it disappeared already pass self.assertTrue(man_proc.is_alive) @@ -578,12 +720,60 @@ def test_maintain_stress(self): self.assertTrue(man_proc.is_alive) + for puid in puids: + if puid != 0: + gs_info = query(puid) + self.assertTrue(gs_info.state == gs_info.State.DEAD) + + pg.stop() + + man_proc.join() + + def test_maintain_clean_exit_with_restarts(self): + + # we will keep killing processes that keep exiting. + # the class has to maintain them and GS has to handle all of that. + testtime = 3 # sec + + # set a sleep time that is long enough for the kill loop to finish + # before all the procs exit + template = ProcessTemplate("sleep", args=(f"{round(testtime)}",), cwd=".") + + pg = ProcessGroup(ignore_error_on_exit=True) # We're going to SIGKILL or SIGTERM everything. Don't raise an exception on it. 
+ pg.add_process(64, template) + pg.init() + man_proc = pg._manager._proc + self._update_interval_sec = 0.1 + + pg.start() + + beg = time.monotonic() + while (time.monotonic() - beg) < testtime: + puids = pg.puids + + try: + puid = puids[random.randint(0, len(puids) - 1)] + kill(puid, sig=signal.SIGKILL) + except ProcessError: # maybe it disappeared already + pass + + self.assertTrue(man_proc.is_alive) + self.assertTrue(pg.status != "Error") + + time.sleep(0.2) + + puids = pg.puids + pg.join() + + while not pg.status == "Idle": + self.assertTrue(pg.status != "Error") + + self.assertTrue(man_proc.is_alive) for puid in puids: gs_info = query(puid) self.assertTrue(gs_info.state == gs_info.State.DEAD) pg.stop() - man_proc.join() def test_walltime(self): @@ -594,12 +784,11 @@ def test_walltime(self): pg.add_process(3, self.template) pg.init() - pg.start() start = time.monotonic() + pg.start() while not pg.status == "Idle": self.assertFalse(pg.status == "Error") - time.sleep(0.5) stop = time.monotonic() diff --git a/test/native/test_redirection.py b/test/native/test_redirection.py index be6807a..42cd951 100644 --- a/test/native/test_redirection.py +++ b/test/native/test_redirection.py @@ -31,6 +31,22 @@ def test_native_process_stdout(self): self.assertEqual('Hello World\n', result) proc.stdout.close() + def test_native_process_stdout2(self): + + exe = sys.executable + proc = Popen(executable=exe, args=['-c', 'print("Hello World")'], stdout=Popen.PIPE) + + result = '' + try: + while True: + data = proc.stdout.recv() + result += data + except EOFError: + pass + + self.assertEqual('Hello World\n', result) + proc.stdout.close() + def test_native_process_stdout_to_devnull(self): ''' No output should appear on the terminal from running this test. diff --git a/test/pmod/test_pmod.c b/test/pmod/test_pmod.c index 3e1930e..1b95e21 100644 --- a/test/pmod/test_pmod.c +++ b/test/pmod/test_pmod.c @@ -82,9 +82,8 @@ void create_fake_infra_pool() err = dragon_memory_pool_serialize(&mem_pool_ser, &mem_pool); pmod_assert(err == DRAGON_SUCCESS); - size_t encoded_size; - char *mem_pool_str = dragon_base64_encode(mem_pool_ser.data, mem_pool_ser.len, &encoded_size); + char *mem_pool_str = dragon_base64_encode(mem_pool_ser.data, mem_pool_ser.len); pmod_assert(mem_pool_str != NULL); setenv("DRAGON_INF_PD", mem_pool_str, 1); @@ -145,9 +144,7 @@ void get_child_ch(int child_count, dragonChannelDescr_t *child_ch) err = dragon_channel_serialize(child_ch, &child_ch_ser); pmod_assert(err == DRAGON_SUCCESS); - size_t encoded_size; - - char *child_ch_str = dragon_base64_encode(child_ch_ser.data, child_ch_ser.len, &encoded_size); + char *child_ch_str = dragon_base64_encode(child_ch_ser.data, child_ch_ser.len); setenv("DRAGON_PMOD_CHILD_CHANNEL", child_ch_str, 1); diff --git a/test/release/hello.py b/test/release/hello.py new file mode 100644 index 0000000..44159b3 --- /dev/null +++ b/test/release/hello.py @@ -0,0 +1 @@ +print("Hello world") diff --git a/test/release/test_bounce.sh b/test/release/test_bounce.sh new file mode 100755 index 0000000..ed182f1 --- /dev/null +++ b/test/release/test_bounce.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +#To change the version of Python, please use the following commands: +# To find the version of Python on your system: ls -ls /usr/bin/python* +# To load the version of Python you want: `alias python=python` or `alias python='\usr\bin\python'` +# The following bounce test involves running "hello world" from 512 nodes. Dragon is stressed to see how it behaves under high load conditions. 
+# The following bounce timings are completed with 512 nodes. + +#./test_bounce.sh +# Cold start +# Hello world + +# real 0m26.281s +# user 0m10.906s +# sys 0m2.862s +# Warm start +# Hello world + +# real 0m17.046s +# user 0m7.506s +# sys 0m2.127s + +# The following line of code can be run to allocate the number of nodes needed: +# salloc --nodes=512 --exclusive -t 01:00:00 + +echo "Cold start" +time dragon hello.py + +echo "Warm start" +time dragon hello.py + +# #The following lines of code ensure swift clean up. +# #dragon-cleanup +# #scancel -u $USER diff --git a/test/release/test_mpi_wrkflw.sh b/test/release/test_mpi_wrkflw.sh new file mode 100755 index 0000000..8629031 --- /dev/null +++ b/test/release/test_mpi_wrkflw.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# To change the version of Python, please use the following commands: +# To find the version of Python on your system: ls -ls /usr/bin/python* +# To load the version of Python you want: `alias python=python` or `alias python='\usr\bin\python'` + +# Python 3.11 + +# NUM_WORKERS | NUM_IMAGES | NUM_BURNS | NUM_ITER | SIZE_IMG | MEM_SIZE | WORK TIME | 0TH_ITER(s) | 1ST_ITER(s) | AVG_TIME (S) | STD_DEV (s) +# 51200 | 51200 | 0 | 2 | 256 | 33554432 | 4.00 | 260.7048197858967 | 163.23564628418535|211.970233035 | 68.9211135397082 + +dragon ../../examples/multiprocessing/numpy-mpi4py-examples/scipy_scale_work.py --dragon --num_workers 51200 --mem 3355443200 --size 256 --iterations 2 --burns 0 --work_time 4 + +# To set up a multi-node environment for deployment of the script, it is recommended to pass the command `salloc --nodes=8 <--exclusive if needed> <-t hh:mm:ss>`. +# The arguments encapuslated by <> are recommended but not needed. + +# The following line of code can be run to allocate the number of nodes needed: +# salloc --nodes=400 --exclusive -t 04:00:00 + +# The following lines of code ensure swift clean up. 
+# dragon-cleanup +# scancel -u $USER diff --git a/test/test_distdict.py b/test/test_distdict.py new file mode 100644 index 0000000..7a7e590 --- /dev/null +++ b/test/test_distdict.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 + +import unittest +import pickle +import sys +import ctypes +import random +import string +import zlib +import json +import time +import os + +import dragon +import dragon.infrastructure.messages as dmsg +import dragon.channels as dch +from dragon.utils import b64encode, b64decode +from dragon.data.ddict.ddict import DDict +import multiprocessing as mp +import traceback +from dragon.rc import DragonError + + +class TestDDict(unittest.TestCase): + def setUp(self): + pass + + def tearDown(self): + pass + + def test_local_channel(self): + ch = dch.Channel.make_process_local() + ch.detach() + + def test_infra_message(self): + msg = dmsg.GSHalted(42) + ser = msg.serialize() + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.GSHalted) + newser = 'eJyrVoovSVayUjA21lFQKklMBzItawE+xQWS' + from_str = dmsg.parse(newser) + self.assertIsInstance(from_str, dmsg.GSHalted) + newser = 'eJyrVoovSVayUjA21lFQKklMBzItawE+xQWS\n' + from_str = dmsg.parse(newser) + self.assertIsInstance(from_str, dmsg.GSHalted) + newline = b'\n\n\n\n' + encoded = b64encode(newline) + decoded = b64decode(encoded) + self.assertEqual(newline, decoded) + newline = '\n\n\n\n' + encoded = b64encode(newline.encode('utf-8')) + decoded = b64decode(encoded) + self.assertEqual(newline, decoded.decode('utf-8')) + + def test_capnp_message (self): + msg = dmsg.DDRegisterClient(42, "HelloWorld", "MiskaIsAdorable") + ser = msg.serialize() + + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.DDRegisterClient) + + def test_ddict_client_response_message(self): + msg = dmsg.DDRegisterClientResponse(42, 43, DragonError.SUCCESS, 0, 2, 'this is dragon error info') + ser = msg.serialize() + newmsg = dmsg.parse(ser) + self.assertIsInstance(newmsg, dmsg.DDRegisterClientResponse) + + def test_bringup_teardown(self): + d = DDict(2,1,3000000) + d.destroy() + + def test_detach_client(self): + d = DDict(2,1,3000000) + d.detach() + d.destroy() + + def test_put_and_get(self): + d = DDict(2,1,3000000) + + d['abc'] = 'def' + x = d['abc'] + self.assertEqual(d['abc'], 'def') + + d[123] = '456' + x = d[123] + self.assertEqual(d[123], '456') + + d[(12,34,56)] = [1,2,3,4,5,6] + y = d[(12,34,56)] + y1 = d[(12,34,56)] # test if the key-value can be requested twice or more + y2 = d[(12,34,56)] + self.assertEqual(y, [1,2,3,4,5,6]) + self.assertEqual(y1, [1,2,3,4,5,6]) + self.assertEqual(y2, [1,2,3,4,5,6]) + self.assertEqual(d[(12,34,56)], [1,2,3,4,5,6]) + + try: + y = d['hello'] + raise AttributeError('Expected KeyError not raised') + except KeyError: + pass + + d.destroy() + + def test_pop(self): + d = DDict(2,1,3000000) + d['abc'] = 'def' + x = d.pop('abc') + self.assertEqual(x, 'def') + self.assertRaises(KeyError, d.pop, 'abc') + + d[123] = 456 + del d[123] + self.assertRaises(KeyError, d.pop, 123) + + d[(12,34,56)] = [1,2,3,4,5,6] + x = d.pop((12,34,56)) + self.assertEqual(x, [1,2,3,4,5,6]) + self.assertRaises(KeyError, d.pop, (12,34,56)) + + d.destroy() + + def test_contains_key(self): + d = DDict(2,1,3000000) + d['abc'] = 'def' + self.assertTrue('abc' in d) # test existence of the added key + self.assertFalse(123 in d) # test existence if the key is never added + d[123] = 456 + self.assertTrue(123 in d) + d.pop(123) + self.assertFalse(123 in d) # test existence of a poped key + d.pop('abc') + self.assertFalse('abc' in 
d) # test existence of a poped key + + # test tuple key and value + d[(1,2,3,4,5)] = [6,7,8,9,10] + self.assertTrue((1,2,3,4,5) in d) + del d[(1,2,3,4,5)] + self.assertFalse((1,2,3,4,5) in d) + + d.destroy() + + def test_len(self): + d = DDict(2,1,3000000) + self.assertEqual(len(d), 0) + d['abc'] = 'def' + self.assertEqual(len(d), 1) + d[123] = 456 + self.assertEqual(len(d), 2) + d[(1,2,3,4,5)] = [6,7,8,9,10] + self.assertEqual(len(d), 3) + d.pop('abc') + self.assertEqual(len(d), 2) + d.pop(123) + self.assertEqual(len(d), 1) + d.pop((1,2,3,4,5)) + self.assertEqual(len(d), 0) + d.destroy() + + def test_clear(self): + d = DDict(2,1,3000000) + d['abc'] = 'def' + d[123] = 456 + d[(1,2,3,4,5)] = [6,7,8,9,10] + self.assertEqual(len(d), 3) + d.clear() + self.assertEqual(len(d), 0) + d.clear() # test clearing an empty dictionary + self.assertEqual(len(d), 0) + d['hello'] = 'world' + d.clear() + self.assertEqual(len(d), 0) + d.destroy() + + @unittest.skip('Not yet implemented') + def test_iter(self): + try: + d = DDict(2,1,3000000) + k = ['abc', 98765, 'hello', (1,2,3,4,5)] + v = ['def', 200, 'world', ['a',1,3,5,'b']] + for i, key in enumerate(k): + d[key] = v[i] + + for i in d: + if i == "abc": + self.assertEqual(d[i], 'def') + elif i == 98765: + self.assertEqual(d[i], 200) + elif i == 'hello': + self.assertEqual(d[i], 'world') + elif i == (1,2,3,4,5): + self.assertEqual(d[i], ['a',1,3,5,'b']) + else: + raise RuntimeError(f'Get the key which is not added by client: key={i}') + + iter_d = iter(d) + ddict_keys = [] + while True: + try: + ddict_keys.append(next(iter_d)) + except StopIteration: + del iter_d + break + for key in k: + self.assertTrue(key in ddict_keys) + + d.destroy() + except Exception as e: + tb = traceback.format_exc() + raise Exception(f'Exception caught {e}\n Traceback: {tb}', flush=True) + + def test_keys(self): + d = DDict(2, 1, 3000000) + k = ['abc', 98765, 'hello', (1,2,3,4,5)] + v = ['def', 200, 'world', ['a',1,3,5,'b']] + for i, key in enumerate(k): + d[key] = v[i] + ddict_keys = d.keys() + for key in k: + self.assertTrue(key in ddict_keys) + d.destroy() + +if __name__ == "__main__": + mp.set_start_method('dragon') + unittest.main() diff --git a/test/test_launcher.py b/test/test_launcher.py index f045421..d27fd40 100644 --- a/test/test_launcher.py +++ b/test/test_launcher.py @@ -9,6 +9,7 @@ from launcher.test_signal_handling import SigIntTest from launcher.test_frontend_bringup import FrontendBringUpTeardownTest from launcher.test_backend_bringup import BackendBringUpTeardownTest +from launcher.test_resilient_restart import FrontendRestartTest if __name__ == "__main__": diff --git a/test/test_mpbridge.py b/test/test_mpbridge.py index 4f96af6..e7cd5a5 100644 --- a/test/test_mpbridge.py +++ b/test/test_mpbridge.py @@ -9,6 +9,7 @@ from mpbridge.test_condition import TestCondition from mpbridge.test_lock import TestDragonLocks from mpbridge.test_process import TestMPBridgeProcess +from mpbridge.test_pool import TestMPBridgePool from mpbridge.test_api import TestMultiprocessingAPI, TestMultiprocessingInternalPatching # from mpbridge.test_barrier import TestBarrier diff --git a/test/test_native.py b/test/test_native.py index e8a35d3..58ba935 100644 --- a/test/test_native.py +++ b/test/test_native.py @@ -13,6 +13,7 @@ from native.test_lock import TestLock from native.test_queue import TestQueue from native.test_redirection import TestIORedirection +from native.test_ddict import TestDDict if __name__ == "__main__": diff --git a/test/transport/tcp/test_messages.py 
b/test/transport/tcp/test_messages.py index 9a03cba..81227ca 100644 --- a/test/transport/tcp/test_messages.py +++ b/test/transport/tcp/test_messages.py @@ -99,9 +99,9 @@ def setUpClass(cls): cls.msg = messages.SendRequest( seqno=1, timeout=0.5, channel_sd=b'channel desc', return_mode=messages.SendReturnMode.WHEN_BUFFERED, sendhid=uuid4(), - payload=b'payload', + payload=b'payload', hints=0, clientid=0 ) - cls.data = b'\x01\x00\x00\x00\x00\x00\x00\x00\x01?\x00\x00\x00\x00\x0cchannel desc\x02' + cls.msg.sendhid.bytes + b'\x00\x00\x00\x00\x00\x00\x00\x07payload' + cls.data = b'\x01\x00\x00\x00\x00\x00\x00\x00\x01?\x00\x00\x00\x00\x0cchannel desc\x02' + cls.msg.sendhid.bytes + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07payload' class SendMemoryRequestTestCase(TransmittableTestCase): @@ -110,9 +110,9 @@ def setUpClass(cls): cls.msg = messages.SendMemoryRequest( seqno=1, timeout=0.5, channel_sd=b'channel desc', return_mode=messages.SendReturnMode.WHEN_BUFFERED, sendhid=uuid4(), - payload=b'payload', mem_sd=b'memory desc', + payload=b'payload', mem_sd=b'memory desc', clientid=0, hints=0 ) - cls.data = b'\x02\x00\x00\x00\x00\x00\x00\x00\x01?\x00\x00\x00\x00\x0cchannel desc\x02' + cls.msg.sendhid.bytes + b'\x00\x00\x00\x00\x00\x00\x00\x07payload\x00\x0bmemory desc' + cls.data = b'\x02\x00\x00\x00\x00\x00\x00\x00\x01?\x00\x00\x00\x00\x0cchannel desc\x02' + cls.msg.sendhid.bytes + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07payload\x00\x0bmemory desc' class RecvRequestTestCase(TransmittableTestCase): @@ -150,8 +150,8 @@ class RecvResponseTestCase(TransmittableTestCase): @classmethod def setUpClass(cls): - cls.msg = messages.RecvResponse(seqno=1, payload=b'payload') - cls.data = b'\xfc\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x07payload' + cls.msg = messages.RecvResponse(seqno=1, payload=b'payload', clientid=0, hints=0) + cls.data = b'\xfc\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07payload' class EventResponseTestCase(TransmittableTestCase): diff --git a/test/transport/tcp/test_transport.py b/test/transport/tcp/test_transport.py index 8e3c431..943ae8a 100644 --- a/test/transport/tcp/test_transport.py +++ b/test/transport/tcp/test_transport.py @@ -28,7 +28,7 @@ def setUpClass(cls): cls.SendRequest = partial(messages.SendRequest, seqno=None, timeout=0.5, channel_sd=b'channel desc', return_mode=messages.SendReturnMode.WHEN_BUFFERED, sendhid=uuid4(), - payload=b'payload', + payload=b'payload', clientid=0, hints=0 ) cls.RecvRequest = partial(messages.RecvRequest, diff --git a/test/utils/test_basic_mempool.py b/test/utils/test_basic_mempool.py index 4d7a079..526b5b4 100644 --- a/test/utils/test_basic_mempool.py +++ b/test/utils/test_basic_mempool.py @@ -324,6 +324,54 @@ def test_alloc_string(self): with self.assertRaises(TypeError): _ = self.mpool.alloc("512") + def test_alloc_hash(self): + mem = self.mpool.alloc(5) + memview = mem.get_memview() + memview[:5] = b'hello' + self.assertGreaterEqual(hash(mem),0) + + def test_alloc_hash2(self): + mem = self.mpool.alloc(15) + memview = mem.get_memview() + memview[:15] = b'hellohellohello' + self.assertGreaterEqual(hash(mem),0) + + def test_alloc_equals(self): + mem = self.mpool.alloc(5) + memview = mem.get_memview() + memview[:5] = b'hello' + mem2 = self.mpool.alloc(15) + memview2 = mem2.get_memview() + memview2[:15] = b'hellohellohello' + 
self.assertNotEqual(mem, mem2) + + def test_alloc_eq(self): + mem = self.mpool.alloc(5) + memview = mem.get_memview() + memview[:5] = b'hello' + mem2 = self.mpool.alloc(5) + memview2 = mem2.get_memview() + memview2[:5] = b'hello' + self.assertEqual(mem,mem2) + + def test_alloc_eq2(self): + mem = self.mpool.alloc(5) + memview = mem.get_memview() + memview[:5] = b'hello' + mem2 = self.mpool.alloc(15) + memview2 = mem2.get_memview() + memview2[:5] = b'hello' + self.assertNotEqual(mem,mem2) + + def test_alloc_eq3(self): + mem = self.mpool.alloc(5) + memview = mem.get_memview() + memview[:5] = b'hello' + mem2 = self.mpool.alloc(15) + memview2 = mem2.get_memview() + memview2[:15] = b'hellohellohello' + self.assertNotEqual(mem,mem2) + def test_alloc_negative(self): with self.assertRaises(Exception): _ = self.mpool.alloc(-1)
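
Note (editor, not part of the patch): the new test_alloc_hash/test_alloc_eq cases above exercise content-based equality and hashing for pool allocations. The sketch below illustrates what that buys a caller; it assumes an already-created MemoryPool instance (here named `mpool`, as in the unshown test fixture) and that the hash implementation honors Python's equality contract. It is a minimal illustration, not the library's documented behavior.

    # Minimal sketch: content-based equality/hashing of allocations,
    # mirroring the test_alloc_hash / test_alloc_eq cases above.
    # `mpool` is assumed to be an existing MemoryPool (created elsewhere).
    def demo_alloc_equality(mpool):
        a = mpool.alloc(5)
        a.get_memview()[:5] = b'hello'
        b = mpool.alloc(5)
        b.get_memview()[:5] = b'hello'

        assert a == b               # identical contents compare equal
        assert hash(a) == hash(b)   # equality contract implies matching hashes
        lookup = {a: 'first'}       # hence allocations can serve as dict keys
        assert lookup[b] == 'first'

        c = mpool.alloc(15)
        c.get_memview()[:15] = b'hellohellohello'
        assert a != c               # different contents compare unequal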