From f98ae8b676c6f112928af22d53e3d821ee542ab5 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 08:21:09 -0700 Subject: [PATCH 1/9] libpmi: add upmi_abort() Problem: the UPMI abstract PMI interface does not implement the PMI abort function, but this is useful to avoid hangs when something goes wrong on one node during PMI bootstrap. Implement the abort function in the abstract interface and in the simple, libpmi, libpmi2, and singleton implementations. --- src/common/libpmi/upmi.c | 22 ++++++++++++++++++++++ src/common/libpmi/upmi.h | 1 + src/common/libpmi/upmi_libpmi.c | 25 ++++++++++++++++++++++++- src/common/libpmi/upmi_libpmi2.c | 25 +++++++++++++++++++++++++ src/common/libpmi/upmi_simple.c | 21 +++++++++++++++++++++ src/common/libpmi/upmi_single.c | 19 +++++++++++++++++++ 6 files changed, 112 insertions(+), 1 deletion(-) diff --git a/src/common/libpmi/upmi.c b/src/common/libpmi/upmi.c index 207855c68aa4..65aa267741da 100644 --- a/src/common/libpmi/upmi.c +++ b/src/common/libpmi/upmi.c @@ -470,6 +470,28 @@ int upmi_finalize (struct upmi *upmi, flux_error_t *errp) return 0; } +int upmi_abort (struct upmi *upmi, const char *msg, flux_error_t *errp) +{ + flux_error_t error; + + if (!upmi || !msg) { + errprintf (errp, "invalid argument\n"); + return -1; + } + if (upmi_call (upmi, + "upmi.abort", + &error, + "{s:s}", + "msg", msg) < 0) { + errprintf (errp, "%s", error.text); + upmi_trace (upmi, "abort: %s", error.text); + return -1; + } + // possibly not reached + upmi_trace (upmi, "abort: success"); + return 0; +} + int upmi_put (struct upmi *upmi, const char *key, const char *value, diff --git a/src/common/libpmi/upmi.h b/src/common/libpmi/upmi.h index 999f2656fc69..5ed65d5b5da3 100644 --- a/src/common/libpmi/upmi.h +++ b/src/common/libpmi/upmi.h @@ -55,6 +55,7 @@ int upmi_get (struct upmi *upmi, flux_error_t *error); int upmi_barrier (struct upmi *upmi, flux_error_t *error); +int upmi_abort (struct upmi *upmi, const char *msg, flux_error_t 
*error); #endif /* !_LIBPMI_UPMI_H */ diff --git a/src/common/libpmi/upmi_libpmi.c b/src/common/libpmi/upmi_libpmi.c index a9afc29b7c83..64782ff4d8fe 100644 --- a/src/common/libpmi/upmi_libpmi.c +++ b/src/common/libpmi/upmi_libpmi.c @@ -29,6 +29,7 @@ struct plugin_ctx { void *dso; int (*init) (int *spawned); int (*finalize) (void); + int (*abort) (int exit_code, const char *error_msg); int (*get_size) (int *size); int (*get_rank) (int *rank); int (*barrier) (void); @@ -122,7 +123,8 @@ static struct plugin_ctx *plugin_ctx_create (const char *path, || !(ctx->kvs_get_my_name = dlsym (ctx->dso, "PMI_KVS_Get_my_name")) || !(ctx->kvs_put = dlsym (ctx->dso, "PMI_KVS_Put")) || !(ctx->kvs_commit = dlsym (ctx->dso, "PMI_KVS_Commit")) - || !(ctx->kvs_get = dlsym (ctx->dso, "PMI_KVS_Get"))) { + || !(ctx->kvs_get = dlsym (ctx->dso, "PMI_KVS_Get")) + || !(ctx->abort = dlsym (ctx->dso, "PMI_Abort"))) { errprintf (error, "%s: missing required PMI_* symbols", path); goto error; } @@ -215,6 +217,26 @@ static int op_barrier (flux_plugin_t *p, return 0; } +static int op_abort (flux_plugin_t *p, + const char *topic, + flux_plugin_arg_t *args, + void *data) +{ + struct plugin_ctx *ctx = flux_plugin_aux_get (p, plugin_name); + int result; + const char *msg; + + if (flux_plugin_arg_unpack (args, + FLUX_PLUGIN_ARG_IN, + "{s:s}", + "msg", &msg) < 0) + return upmi_seterror (p, args, "error unpacking abort arguments"); + result = ctx->abort (1, msg); + if (result != PMI_SUCCESS) + return upmi_seterror (p, args, "%s", pmi_strerror (result)); + return 0; +} + static int op_initialize (flux_plugin_t *p, const char *topic, flux_plugin_arg_t *args, @@ -313,6 +335,7 @@ static const struct flux_plugin_handler optab[] = { { "upmi.put", op_put, NULL }, { "upmi.get", op_get, NULL }, { "upmi.barrier", op_barrier, NULL }, + { "upmi.abort", op_abort, NULL }, { "upmi.initialize", op_initialize, NULL }, { "upmi.finalize", op_finalize, NULL }, { "upmi.preinit", op_preinit, NULL }, diff --git 
a/src/common/libpmi/upmi_libpmi2.c b/src/common/libpmi/upmi_libpmi2.c index 5a8954d6c095..e160746dccfc 100644 --- a/src/common/libpmi/upmi_libpmi2.c +++ b/src/common/libpmi/upmi_libpmi2.c @@ -34,6 +34,7 @@ struct plugin_ctx { void *dso; int (*init) (int *spawned, int *size, int *rank, int *appnum); int (*finalize) (void); + int (*abort) (int flag, const char *msg); int (*job_getid) (char *jobid, int jobid_size); int (*kvs_put) (const char *key, const char *value); int (*kvs_fence) (void); @@ -129,6 +130,7 @@ static struct plugin_ctx *plugin_ctx_create (const char *path, goto error; if (!(ctx->init = dlsym (ctx->dso, "PMI2_Init")) || !(ctx->finalize = dlsym (ctx->dso, "PMI2_Finalize")) + || !(ctx->abort = dlsym (ctx->dso, "PMI2_Abort")) || !(ctx->job_getid = dlsym (ctx->dso, "PMI2_Job_GetId")) || !(ctx->kvs_put = dlsym (ctx->dso, "PMI2_KVS_Put")) || !(ctx->kvs_fence = dlsym (ctx->dso, "PMI2_KVS_Fence")) @@ -319,6 +321,28 @@ static int op_barrier (flux_plugin_t *p, return 0; } +static int op_abort (flux_plugin_t *p, + const char *topic, + flux_plugin_arg_t *args, + void *data) + +{ + struct plugin_ctx *ctx = flux_plugin_aux_get (p, plugin_name); + int flag = 1; // abort all processes in the job + const char *msg; + int result; + + if (flux_plugin_arg_unpack (args, + FLUX_PLUGIN_ARG_IN, + "{s:s}", + "msg", &msg) < 0) + return upmi_seterror (p, args, "error unpacking abort arguments"); + result = ctx->abort (flag, msg); + if (result != PMI2_SUCCESS) + return upmi_seterror (p, args, "%s", pmi_strerror (result)); + return 0; +} + static int op_initialize (flux_plugin_t *p, const char *topic, flux_plugin_arg_t *args, @@ -422,6 +446,7 @@ static const struct flux_plugin_handler optab[] = { { "upmi.put", op_put, NULL }, { "upmi.get", op_get, NULL }, { "upmi.barrier", op_barrier, NULL }, + { "upmi.abort", op_abort, NULL }, { "upmi.initialize", op_initialize, NULL }, { "upmi.finalize", op_finalize, NULL }, { "upmi.preinit", op_preinit, NULL }, diff --git 
a/src/common/libpmi/upmi_simple.c b/src/common/libpmi/upmi_simple.c index d3a937c8903f..82be7f2e5e11 100644 --- a/src/common/libpmi/upmi_simple.c +++ b/src/common/libpmi/upmi_simple.c @@ -126,6 +126,26 @@ static int op_barrier (flux_plugin_t *p, return 0; } +static int op_abort (flux_plugin_t *p, + const char *topic, + flux_plugin_arg_t *args, + void *data) +{ + struct plugin_ctx *ctx = flux_plugin_aux_get (p, plugin_name); + const char *msg; + int result; + + if (flux_plugin_arg_unpack (args, + FLUX_PLUGIN_ARG_IN, + "{s:s}", + "msg", &msg) < 0) + return upmi_seterror (p, args, "error unpacking abort arguments"); + result = pmi_simple_client_abort (ctx->client, 1, msg); + if (result != PMI_SUCCESS) + return upmi_seterror (p, args, "%s", pmi_strerror (result)); + return 0; +} + static int op_initialize (flux_plugin_t *p, const char *topic, flux_plugin_arg_t *args, @@ -195,6 +215,7 @@ static const struct flux_plugin_handler optab[] = { { "upmi.put", op_put, NULL }, { "upmi.get", op_get, NULL }, { "upmi.barrier", op_barrier, NULL }, + { "upmi.abort", op_abort, NULL }, { "upmi.initialize", op_initialize, NULL }, { "upmi.finalize", op_finalize, NULL }, { "upmi.preinit", op_preinit, NULL }, diff --git a/src/common/libpmi/upmi_single.c b/src/common/libpmi/upmi_single.c index 40c4e577c370..58a3e63c91f1 100644 --- a/src/common/libpmi/upmi_single.c +++ b/src/common/libpmi/upmi_single.c @@ -103,6 +103,24 @@ static int op_barrier (flux_plugin_t *p, return 0; } +static int op_abort (flux_plugin_t *p, + const char *topic, + flux_plugin_arg_t *args, + void *data) +{ + const char *msg; + + if (flux_plugin_arg_unpack (args, + FLUX_PLUGIN_ARG_IN, + "{s:s}", + "msg", &msg) < 0) + return upmi_seterror (p, args, "error unpacking abort arguments"); + fprintf (stderr, "%s\n", msg); + exit (1); + //NOTREACHED + return 0; +} + static int op_initialize (flux_plugin_t *p, const char *topic, flux_plugin_arg_t *args, @@ -148,6 +166,7 @@ static const struct flux_plugin_handler optab[] = { { 
"upmi.put", op_put, NULL }, { "upmi.get", op_get, NULL }, { "upmi.barrier", op_barrier, NULL }, + { "upmi.abort", op_abort, NULL }, { "upmi.initialize", op_initialize, NULL }, { "upmi.finalize", op_finalize, NULL }, { "upmi.preinit", op_preinit, NULL }, From 93704002bfa52ee6483332e90565915c4da7fd4b Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 09:23:20 -0700 Subject: [PATCH 2/9] broker: call upmi_abort() on PMI bootstrap error Problem: the instance hangs during startup if a bind address cannot be determined. If a fatal error occurs during PMI bootstrap on some but not all ranks, some brokers may block forever in the PMI barrier. Call the PMI abort function when something goes wrong during PMI bootstrap. Fixes #6278 --- src/broker/boot_pmi.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/broker/boot_pmi.c b/src/broker/boot_pmi.c index 5546d79532db..3786ee9dcee9 100644 --- a/src/broker/boot_pmi.c +++ b/src/broker/boot_pmi.c @@ -261,7 +261,8 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) } if (upmi_initialize (upmi, &info, &error) < 0) { log_msg ("%s: initialize: %s", upmi_describe (upmi), error.text); - goto error; + upmi_destroy (upmi); + return -1; } if (set_instance_level_attr (upmi, info.name, attrs) < 0) { log_err ("set_instance_level_attr"); @@ -506,6 +507,13 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) topology_decref (topo); return 0; error: + /* We've logged error to stderr before getting here so the fatal + * error message passed to the PMI server does not necessarily need + * to be highly detailed. Some implementations of abort may not + * return. 
+ */ + if (upmi_abort (upmi, "fatal bootstrap error", &error) < 0) + log_msg ("upmi_abort: %s", error.text); free (bizcard); upmi_destroy (upmi); hostlist_destroy (hl); From cd92cd3a7efbba837708012f48c99699d0769b5b Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 10:36:04 -0700 Subject: [PATCH 3/9] flux-start: fix whitespace issues Problem: some flux-start.c code does not conform to project norms. Break long parameter lists to one per line. --- src/cmd/flux-start.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cmd/flux-start.c b/src/cmd/flux-start.c index ad2f9d7db89f..71ed027cfec9 100644 --- a/src/cmd/flux-start.c +++ b/src/cmd/flux-start.c @@ -493,15 +493,19 @@ static void pmi_debug_trace (void *client, const char *buf) fprintf (stderr, "%d: %s", cli->rank, buf); } -int pmi_kvs_put (void *arg, const char *kvsname, - const char *key, const char *val) +int pmi_kvs_put (void *arg, + const char *kvsname, + const char *key, + const char *val) { zhash_update (ctx.pmi.kvs, key, xstrdup (val)); zhash_freefn (ctx.pmi.kvs, key, (zhash_free_fn *)free); return 0; } -int pmi_kvs_get (void *arg, void *client, const char *kvsname, +int pmi_kvs_get (void *arg, + void *client, + const char *kvsname, const char *key) { char *v = zhash_lookup (ctx.pmi.kvs, key); From 7254e54856702f76972c7690574ec58106b840d8 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 10:43:39 -0700 Subject: [PATCH 4/9] flux-start: implement PMI abort callback Problem: if the flux broker calls the PMI abort function, flux-start (the PMI server) is not notified. Add an abort callback that logs a message and asks all subprocesses to terminate immediately. Update tbon.interface-hint test that was expecting broker to exit 1 on a bad hint. The broker is now terminated with SIGKILL. Update another tbon.interface-hint test that required a ratcheted down timeout. That test now fails immediately. 
--- src/cmd/flux-start.c | 23 +++++++++++++++++++++++ t/t0001-basic.t | 4 ++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/cmd/flux-start.c b/src/cmd/flux-start.c index 71ed027cfec9..cdfa735ff4e7 100644 --- a/src/cmd/flux-start.c +++ b/src/cmd/flux-start.c @@ -514,6 +514,28 @@ int pmi_kvs_get (void *arg, return 0; } +void pmi_abort (void *arg, + void *client, + int exit_code, + const char *error_message) +{ + struct client *cli = client; + + log_msg ("%d: PMI_Abort()%s%s", + cli->rank, + error_message ? ": " : "", + error_message ? error_message : ""); + + cli = zlist_first (ctx.clients); + while (cli) { + if (cli->p) { + flux_future_t *f = flux_subprocess_kill (cli->p, SIGKILL); + flux_future_destroy (f); + } + cli = zlist_next (ctx.clients); + } +} + int execvp_argz (char *argz, size_t argz_len) { char **av = malloc (sizeof (char *) * (argz_count (argz, argz_len) + 1)); @@ -719,6 +741,7 @@ void pmi_server_initialize (int flags) struct taskmap *map; const char *mode = optparse_get_str (ctx.opts, "test-pmi-clique", "single"); struct pmi_simple_ops ops = { + .abort = pmi_abort, .kvs_put = pmi_kvs_put, .kvs_get = pmi_kvs_get, .barrier_enter = NULL, diff --git a/t/t0001-basic.t b/t/t0001-basic.t index 7aea2395bc33..bc8e0b9b4027 100755 --- a/t/t0001-basic.t +++ b/t/t0001-basic.t @@ -476,7 +476,7 @@ test_expect_success 'TOML tcp.interface-hint=wrong type fails' ' ' test_expect_success 'tcp.interface-hint=badiface fails' ' test_expect_code 137 flux start -o,-Stbon.interface-hint=badiface \ - ${ARGS} --test-exit-timeout=1s -s2 -o,-Stbon.prefertcp=1 true + ${ARGS} -s2 -o,-Stbon.prefertcp=1 true ' test_expect_success 'tcp.interface-hint=default-route works' ' flux start -o,-Stbon.interface-hint=default-route,-Stbon.prefertcp=1 \ @@ -487,7 +487,7 @@ test_expect_success 'tcp.interface-hint=hostname works' ' ${ARGS} -s2 true ' test_expect_success 'tbon.endpoint cannot be set' ' - test_must_fail flux start ${ARGS} -s2 \ + test_expect_code 137 flux start 
${ARGS} -s2 \ -o,--setattr=tbon.endpoint=ipc:///tmp/customflux /bin/true ' test_expect_success 'tbon.parent-endpoint cannot be read on rank 0' ' From ae4226bb87706bdadf8c7a678819b862dbeb1b61 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 11:03:14 -0700 Subject: [PATCH 5/9] flux-pmi: add barrier --abort option Problem: there is no way to test the UPMI abort function without starting a Flux instance. Add an --abort=RANK option to 'flux pmi barrier'. The specified rank calls the abort function instead of the barrier. --- src/cmd/builtin/pmi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/cmd/builtin/pmi.c b/src/cmd/builtin/pmi.c index a57dec868284..a2522cd4df35 100644 --- a/src/cmd/builtin/pmi.c +++ b/src/cmd/builtin/pmi.c @@ -20,6 +20,7 @@ #include "src/common/libpmi/upmi.h" #include "src/common/libutil/monotime.h" #include "src/common/libutil/log.h" +#include "src/common/libutil/errprintf.h" #include "ccan/str/str.h" static struct upmi *upmi; @@ -59,6 +60,7 @@ static int internal_cmd_barrier (optparse_t *p, int argc, char *argv[]) { int n = optparse_option_index (p); int count = optparse_get_int (p, "count", 1); + int abort = optparse_get_int (p, "abort", -1); struct timespec t; const char *label; flux_error_t error; @@ -78,6 +80,17 @@ static int internal_cmd_barrier (optparse_t *p, int argc, char *argv[]) if (upmi_barrier (upmi, &error) < 0) log_msg_exit ("barrier: %s", error.text); + // abort one rank if --abort was specified + if (abort != -1) { + if (info.rank == abort) { + flux_error_t e; + errprintf (&e, "flux-pmi: rank %d is aborting", info.rank); + if (upmi_abort (upmi, e.text, &error) < 0) { + log_msg_exit ("abort: %s", error.text); + } + } + } + while (count-- > 0) { monotime (&t); if (upmi_barrier (upmi, &error) < 0) @@ -216,6 +229,8 @@ static int cmd_pmi (optparse_t *p, int argc, char *argv[]) static struct optparse_option barrier_opts[] = { { .name = "count", .has_arg = 1, .arginfo = "N", .usage = "Execute N 
barrier operations (default 1)", }, + { .name = "abort", .has_arg = 1, .arginfo = "RANK", + .usage = "RANK calls abort instead of barrier", }, OPTPARSE_TABLE_END, }; static struct optparse_option get_opts[] = { From 47cb0285e62a3ed7cc7919d1a8285652f64ccce1 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 11:07:03 -0700 Subject: [PATCH 6/9] shell/pmi: fix whitespace issues Problem: the shell pmi plugin includes code that does not conform to project norms. Break long parameter lists to one per line. Indent function parameters to the same level. --- src/shell/pmi/pmi.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/shell/pmi/pmi.c b/src/shell/pmi/pmi.c index f7ffffbdfc9f..94d7428f59b5 100644 --- a/src/shell/pmi/pmi.c +++ b/src/shell/pmi/pmi.c @@ -102,7 +102,8 @@ static void shell_pmi_abort (void *arg, * This allows the shell to continue to process events and stdio * until the exec system terminates the job due to the exception. */ - flux_shell_raise ("exec", 0, + flux_shell_raise ("exec", + 0, "MPI_Abort%s%s", msg ? ": " : "", msg ? 
msg : ""); @@ -285,17 +286,17 @@ static void exchange_cb (struct pmi_exchange *pex, void *arg) /* pmi_simple_ops->kvs_get() signature */ static int exchange_kvs_get (void *arg, - void *cli, - const char *kvsname, - const char *key) + void *cli, + const char *kvsname, + const char *key) { struct shell_pmi *pmi = arg; json_t *o; const char *val = NULL; if ((o = json_object_get (pmi->locals, key)) - || (o = json_object_get (pmi->pending, key)) - || (o = json_object_get (pmi->global, key))) { + || (o = json_object_get (pmi->pending, key)) + || (o = json_object_get (pmi->global, key))) { val = json_string_value (o); pmi_simple_server_kvs_get_complete (pmi->server, cli, val); return 0; @@ -324,9 +325,9 @@ static int exchange_barrier_enter (void *arg) /* pmi_simple_ops->kvs_put() signature */ static int exchange_kvs_put (void *arg, - const char *kvsname, - const char *key, - const char *val) + const char *kvsname, + const char *key, + const char *val) { struct shell_pmi *pmi = arg; @@ -369,7 +370,8 @@ static void pmi_fd_cb (flux_shell_task_t *task, len = flux_subprocess_read_line (task->proc, "PMI_FD", &line); if (len < 0) { shell_trace ("%d: C: pmi read error: %s", - task->rank, flux_strerror (errno)); + task->rank, + flux_strerror (errno)); return; } if (len == 0) { From 3f29b85243292a52ee539bd9987ad7f5ad62b062 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 11:09:39 -0700 Subject: [PATCH 7/9] shell/pmi: fix abort message Problem: when the broker calls the PMI abort function, the shell PMI plugin logs an exception message that calls out "MPI_Abort()" but MPI is not involved. Log "PMI_Abort()" instead. Fix one test that expected the old message. 
--- src/shell/pmi/pmi.c | 2 +- t/t2602-job-shell.t | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/shell/pmi/pmi.c b/src/shell/pmi/pmi.c index 94d7428f59b5..6b48c8f6523c 100644 --- a/src/shell/pmi/pmi.c +++ b/src/shell/pmi/pmi.c @@ -104,7 +104,7 @@ static void shell_pmi_abort (void *arg, */ flux_shell_raise ("exec", 0, - "MPI_Abort%s%s", + "PMI_Abort%s%s", msg ? ": " : "", msg ? msg : ""); } diff --git a/t/t2602-job-shell.t b/t/t2602-job-shell.t index ae723506b713..c9dff1fca5b4 100755 --- a/t/t2602-job-shell.t +++ b/t/t2602-job-shell.t @@ -137,7 +137,7 @@ test_expect_success 'job-exec: decrease kill timeout for tests' ' test_expect_success 'job-shell: PMI_Abort works' ' ! flux run -N4 -n4 ${PMI_INFO} --abort=1 >abort.log 2>&1 && test_debug "cat abort.log" && - grep "job.exception.*MPI_Abort: Test abort error." abort.log + grep "job.exception.*PMI_Abort: Test abort error." abort.log ' test_expect_success 'job-shell: create expected I/O output' ' ${LPTEST} | sed -e "s/^/0: /" >lptest.exp && From 74cc37b55ed0f4d79f96b0d9d005861ee225310c Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 11:52:42 -0700 Subject: [PATCH 8/9] testsuite: cover flux pmi barrier --abort Problem: there is no coverage for abort in some of the upmi implementations. Add tests. 
--- t/t3002-pmi.t | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/t/t3002-pmi.t b/t/t3002-pmi.t index 9d27e49e6577..02afb2ca480b 100755 --- a/t/t3002-pmi.t +++ b/t/t3002-pmi.t @@ -175,6 +175,11 @@ test_expect_success 'flux-pmi barrier --count works' ' flux run --label-io -n2 \ flux pmi barrier --count=2 ' +test_expect_success 'flux-pmi barrier --abort works' ' + test_expect_code 143 flux run --label-io -n2 \ + flux pmi barrier --abort=1 2>barrier_abort.err && + grep -i abort barrier_abort.err +' test_expect_success 'flux-pmi exchange works' ' flux run --label-io -n2 \ flux pmi exchange @@ -238,6 +243,10 @@ test_expect_success 'flux-pmi --method=libpmi barrier works w/ flux libpmi.so' ' flux run -n2 bash -c "\ flux pmi -v --method=libpmi:$(cat libpmi) barrier" ' +test_expect_success 'flux-pmi --method=libpmi barrier abort works w/ flux libpmi.so' ' + test_expect_code 143 flux run -n2 bash -c "\ + flux pmi -v --method=libpmi:$(cat libpmi) barrier --abort 1" +' test_expect_success 'flux-pmi --method=libpmi exchange works w/ flux libpmi.so' ' flux run -n2 bash -c "\ flux pmi -v --method=libpmi:$(cat libpmi) exchange" @@ -268,6 +277,10 @@ test_expect_success 'flux-pmi --method=libpmi2 barrier works w/ flux pmi lib' ' flux run -n2 bash -c "\ flux pmi -v --method=libpmi2:$(cat libpmi2) barrier" ' +test_expect_success 'flux-pmi --method=libpmi2 barrier abort works w/ flux pmi lib' ' + test_expect_code 143 flux run -n2 bash -c "\ + flux pmi -v --method=libpmi2:$(cat libpmi2) barrier --abort 1" +' test_expect_success 'flux-pmi --method=libpmi2 exchange works w/ flux pmi lib' ' flux run -n2 bash -c "\ flux pmi -v --method=libpmi2:$(cat libpmi2) exchange" From fba4dee6e647edb81afbc3b7ac9c151da941265e Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 12 Sep 2024 11:46:26 -0700 Subject: [PATCH 9/9] flux-pmi(1): document barrier --abort=RANK Problem: the barrier --abort=RANK option has no documentation. Add it to the man page. 
--- doc/man1/flux-pmi.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/man1/flux-pmi.rst b/doc/man1/flux-pmi.rst index b87eaac51009..481f643d10d0 100644 --- a/doc/man1/flux-pmi.rst +++ b/doc/man1/flux-pmi.rst @@ -88,6 +88,11 @@ barrier Execute N barrier (step 2) operations (default 1). +.. option:: --abort=RANK + + Instead of entering the barrier, arrange for RANK to call the PMI + abort function. + exchange --------