From bf2463c6a1d95e5a281ff324f0fc6416aa28e7b3 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Tue, 8 Jan 2019 16:18:13 -0600 Subject: [PATCH 1/4] Log: libcrmcommon: downgrade empty output logging to trace level nothing is not very interesting, so reduce clutter --- lib/common/logging.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/common/logging.c b/lib/common/logging.c index bfd82b7b1bb..ff5ae749d88 100644 --- a/lib/common/logging.c +++ b/lib/common/logging.c @@ -997,7 +997,7 @@ crm_log_output_fn(const char *file, const char *function, int line, int level, c const char *offset = NULL; if (output == NULL) { - level = LOG_DEBUG; + level = LOG_TRACE; output = "-- empty --"; } From 490ef9f6ed30427b3617236c514dd076ffd0e88f Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Thu, 10 Jan 2019 14:36:26 -0600 Subject: [PATCH 2/4] Low: resources: clean serialized file on SIGTERM in Dummy Otherwise it could give a false probe error at next start, confusing whatever else is being tested with a dummy resource. Unfortunately this doesn't help if an in-flight monitor gets cancelled with a SIGKILL, but there's no obvious solution there. --- extra/resources/Dummy | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extra/resources/Dummy b/extra/resources/Dummy index faa0e08ad05..75e4cf5b238 100755 --- a/extra/resources/Dummy +++ b/extra/resources/Dummy @@ -114,6 +114,10 @@ END trap sigterm_handler TERM sigterm_handler() { ocf_log info "They use TERM to bring us down. No such luck." + + # Since we're likely going to get KILLed, clean up any monitor + # serialization in progress, so the next probe doesn't return an error. + rm -f "${VERIFY_SERIALIZED_FILE}" return } @@ -171,6 +175,7 @@ dummy_monitor() { # two monitor ops have occurred at the same time. # This verifies a condition in pacemaker-execd regression tests. ocf_log err "$VERIFY_SERIALIZED_FILE exists already" + ocf_exit_reason "alternate universe collision" return $OCF_ERR_GENERIC fi From fe7172ff757996300c41d3d382d050a69c944bfc Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Tue, 8 Jan 2019 15:31:14 -0600 Subject: [PATCH 3/4] Fix: controller: directly acknowledge unrecordable operation results Regression introduced in 2.0.1-rc1 by 0363985dd Before that commit, if an operation result arrived when there was no resource information available, a warning would be logged and the operation would be directly acknowledged. This could occur, for example, if resource history were cleaned while an operation was pending on that resource. After that commit, in that situation, an assertion and error would be logged, and no acknowledgement would be sent, leading to a transition timeout. Restore the direct ack. Also improve related log messages. --- daemons/controld/controld_execd.c | 80 +++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c index f7c5cded13a..26fcced357b 100644 --- a/daemons/controld/controld_execd.c +++ b/daemons/controld/controld_execd.c @@ -2483,6 +2483,7 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, int update_id = 0; gboolean remove = FALSE; gboolean removed = FALSE; + bool need_direct_ack = FALSE; lrmd_rsc_info_t *rsc = NULL; const char *node_name = NULL; @@ -2513,7 +2514,6 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, op_key, op->rsc_id); } } - CRM_LOG_ASSERT(rsc != NULL); // If it's still NULL, there's a bug somewhere // Get node name if available (from executor state or action XML) if (lrm_state) { @@ -2545,51 +2545,81 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, } if (op->op_status != PCMK_LRM_OP_CANCELLED) { + /* We might not record the result, so directly acknowledge it to the + * originator instead, so it doesn't time out waiting for the result + * (especially important if part of a transition). + */ + need_direct_ack = TRUE; + if (controld_action_is_recordable(op->op_type)) { if (node_name && rsc) { + // We should record the result, and happily, we can update_id = do_update_resource(node_name, rsc, op); + need_direct_ack = FALSE; + + } else if (op->rsc_deleted) { + /* We shouldn't record the result (likely the resource was + * refreshed, cleaned, or removed while this operation was + * in flight). + */ + crm_notice("Not recording %s result in CIB because " + "resource information was removed since it was initiated", + op_key); } else { - // @TODO Should we direct ack? - crm_err("Unable to record %s result in CIB: %s", - op_key, + /* This shouldn't be possible; the executor didn't consider the + * resource deleted, but we couldn't find resource or node + * information. + */ + crm_err("Unable to record %s result in CIB: %s", op_key, (node_name? "No resource information" : "No node name")); } - } else { - send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } + } else if (op->interval_ms == 0) { - /* This will occur when "crm resource cleanup" is called while actions are in-flight */ - crm_err("Op %s (call=%d): Cancelled", op_key, op->call_id); - send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); + /* A non-recurring operation was cancelled. Most likely, the + * never-initiated action was removed from the executor's pending + * operations list upon resource removal. + */ + need_direct_ack = TRUE; } else if (pending == NULL) { - /* We don't need to do anything for cancelled ops - * that are not in our pending op list. There are no - * transition actions waiting on these operations. */ + /* This recurring operation was cancelled, but was not pending. No + * transition actions are waiting on it, nothing needs to be done. + */ } else if (op->user_data == NULL) { - /* At this point we have a pending entry, but no transition - * key present in the user_data field. report this */ - crm_err("Op %s (call=%d): No user data", op_key, op->call_id); + /* This recurring operation was cancelled and pending, but we don't + * have a transition key. This should never happen. + */ + crm_err("Recurring operation %s was cancelled without transition information", + op_key); } else if (pending->remove) { - /* The tengine canceled this op, we have been waiting for the cancel to finish. */ + /* This recurring operation was cancelled (by us) and pending, and we + * have been waiting for it to finish. + */ if (lrm_state) { erase_lrm_history_by_op(lrm_state, op); } } else if (op->rsc_deleted) { - /* The tengine initiated this op, but it was cancelled outside of the - * tengine's control during a resource cleanup/re-probe request. The tengine - * must be alerted that this operation completed, otherwise the tengine - * will continue waiting for this update to occur until it is timed out. - * We don't want this update going to the cib though, so use a direct ack. */ - crm_trace("Op %s (call=%d): cancelled due to rsc deletion", op_key, op->call_id); - send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); + /* This recurring operation was cancelled (but not by us, and the + * executor does not have resource information, likely due to resource + * cleanup, refresh, or removal) and pending. + */ + crm_debug("Recurring op %s was cancelled due to resource deletion", + op_key); + need_direct_ack = TRUE; } else { - /* Before a stop is called, no need to direct ack */ - crm_trace("Op %s (call=%d): no delete event required", op_key, op->call_id); + /* This recurring operation was cancelled (but not by us, likely by the + * executor before stopping the resource) and pending. We don't need to + * do anything special. + */ + } + + if (need_direct_ack) { + send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } if(remove == FALSE) { From cf64fdd8c842a365f90a28cdf3f374a4ba1e62c2 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Thu, 10 Jan 2019 15:10:50 -0600 Subject: [PATCH 4/4] Doc: ChangeLog: update for 2.0.1-rc3 release --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index e3483336d69..ea65fbec37e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +* Thu Jan 10 2019 Ken Gaillot Pacemaker-2.0.1-rc3 +- Changesets: 27 +- Diff: 20 files changed, 375 insertions(+), 195 deletions(-) + +- Changes since Pacemaker-2.0.1-rc2 + + attrd: start new election immediately if writer is lost + + attrd: detect alert configuration changes when CIB is entirely replaced + + controller: avoid transition timeout if resource cleaned while operation + is in-flight (regression in 2.0.1-rc1) + + libstonithd: restore C++ compatibility (regression in 2.0.1-rc1) + + tools: fix crm_resource --clear when lifetime was used with ban/move + + tools: fix crm_resource --move when lifetime was used with previous move + * Wed Dec 19 2018 Ken Gaillot Pacemaker-2.0.1-rc2 - Changesets: 12 - Diff: 2 files changed, 6 insertions(+), 2 deletions(-)