diff --git a/ChangeLog b/ChangeLog index ad73fb5dd0d..93914211ef9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,41 @@ +* Tue Oct 22 2019 Ken Gaillot Pacemaker-1.1.22 +- Changesets: 198 +- Diff: 95 files changed, 3047 insertions(+), 1245 deletions(-) + +- Features added since Pacemaker-1.1.21 + + crmd: new 'fence-reaction' cluster option specifies whether local node + should 'stop' or 'panic' if notified of own fencing + + Pacemaker Remote: allow file for environment variables when used in bundle + + Pacemaker Remote: allow configurable listen address and TLS priorities + + tools: crm_simulate --repeat option to repeat profiling tests + + tools: new pcmk_simtimes tool to compare crm_simulate profiling output + +- Changes since Pacemaker-1.1.21 + + fencing: do not block concurrent fencing actions on a device + (regression since 1.1.21) + + crmd: set timeout on scheduler responses to avoid infinite wait + + crmd: confirm cancel of failed monitors, to avoid transition timeout + + lrmd: let controller cancel monitors, to avoid transition timeout + + lrmd: return error for stonith probes if stonith connection was lost + + fencing: ensure concurrent fencing commands always get triggered to execute + + fencing: fail pending actions and re-sync history after crash and restart + + fencing: don't let command with long delay block other pending commands + + fencing: allow functioning even if CIB updates arrive unceasingly + + pengine: calculate secure digests for unfencing, for replaying saved CIBs + + pengine: properly detect dangling migrations, to avoid restart loop + + pengine: avoid delay in recovery of failed remote connections + + pengine: avoid scheduling actions on remote node that is shutting down + + pengine: wait for probe actions to complete to prevent unnecessary + restart/re-promote of dependent resources + + libcrmcommon: avoid possible use-of-NULL when applying XML diffs + + libcrmcommon: correctly apply XML diffs with multiple move/create changes + + 
libcrmcommon: return error when applying XML diffs containing unknown operations + + tools: fail if tar is not available when running crm_report + + tools: correct crm_report argument parsing + + tools: crm_report: don't ignore log if unrelated file is too large + + agents: calculate #health_disk correctly in SysInfo + + agents: handle run-as-user properly in ClusterMon + * Thu Jun 6 2019 Ken Gaillot Pacemaker-1.1.21 - Changesets: 64 - Diff: 39 files changed, 1985 insertions(+), 843 deletions(-) diff --git a/cts/CTStests.py b/cts/CTStests.py index f328170c9c6..1225c3c8756 100644 --- a/cts/CTStests.py +++ b/cts/CTStests.py @@ -516,7 +516,7 @@ def errorstoignore(self): self.templates["Pat:Fencing_start"] % ".*", self.templates["Pat:Fencing_ok"] % ".*", r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery", - r"error.*: Operation reboot of .*by .* for stonith_admin.*: Timer expired", + r"error.*: Operation 'reboot' targeting .* on .* for stonith_admin.*: Timer expired", ] def is_applicable(self): diff --git a/cts/patterns.py b/cts/patterns.py index 87418d4a70e..e908714181d 100644 --- a/cts/patterns.py +++ b/cts/patterns.py @@ -59,8 +59,8 @@ def __init__(self, name): "Pat:They_dead" : "node %s.*: is dead", "Pat:TransitionComplete" : "Transition status: Complete: complete", - "Pat:Fencing_start" : r"(Initiating remote operation|Requesting peer fencing ).* (for|of) %s", - "Pat:Fencing_ok" : r"stonith.*:\s*Operation .* of %s by .* for .*@.*: OK", + "Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s", + "Pat:Fencing_ok" : r"stonith.*:\s*Operation .* targeting %s on .* for .*@.*: OK", "Pat:Fencing_recover" : r"pengine.*: Recover %s", "Pat:Fencing_active" : r"pengine.*: Resource %s is active on .* nodes", "Pat:Fencing_probe" : r"crmd.*: Result of probe operation for %s on .*: Error", diff --git a/fencing/regression.py.in b/fencing/regression.py.in index aaebcc48f57..28a4e2fca1c 100644 --- a/fencing/regression.py.in +++ 
b/fencing/regression.py.in @@ -5,7 +5,7 @@ # Pacemaker targets compatibility with Python 2.6+ and 3.2+ from __future__ import print_function, unicode_literals, absolute_import, division -__copyright__ = "Copyright (C) 2012-2016 Andrew Beekhof " +__copyright__ = "Copyright 2012-2019 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import io @@ -485,8 +485,8 @@ class Tests(object): ### one merger will happen test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") ### Test that multiple mergers occur test = self.new_test("cpg_custom_merge_multiple", @@ -506,11 +506,11 @@ class Tests(object): test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 
on") ### Test that multiple mergers occur with topologies used test = self.new_test("cpg_custom_merge_with_topology", @@ -533,11 +533,11 @@ class Tests(object): test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client") test.add_stonith_log_pattern("Merging stonith action 'off' targeting node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") - test.add_stonith_log_pattern("Operation off of node3 by") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") + test.add_stonith_log_pattern("Operation 'off' targeting node3 on") def build_fence_no_merge_tests(self): """ Register tests to verify when fence operations should not be merged """ @@ -1130,16 +1130,16 @@ class Tests(object): """-o "pcmk_off_timeout=2" -o "pcmk_reboot_timeout=20" """) test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1 -v true2") test.add_cmd("stonith_admin", "-B node_fake -t 5") - test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") + test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") # timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30) - test.add_stonith_log_pattern("Total timeout set to 3 for peer's fencing of node_fake") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'true2'") - test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping 
to on") + test.add_stonith_log_pattern("Total timeout set to 3 for peer's fencing targeting node_fake") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true2'") + test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") # fence_dummy sets "on" as an on_target action - test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake") - test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake") - test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") + test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake") + test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake") + test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") test = self.new_test("cpg_remap_automatic", "Verify remapped topology reboot skips automatic 'on'", 1) @@ -1151,12 +1151,12 @@ class Tests(object): """-o "mode=pass" -o "pcmk_host_list=node_fake" """) test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1 -v true2") test.add_cmd("stonith_admin", "-B node_fake -t 5") - test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'true2'") - test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on") - test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") - test.add_stonith_neg_log_pattern("perform op 'node_fake on' with") + test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 
'true2'") + test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") + test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") + test.add_stonith_neg_log_pattern("perform 'on' action targeting node_fake using") test.add_stonith_neg_log_pattern("'on' failure") test = self.new_test("cpg_remap_complex_1", @@ -1168,14 +1168,14 @@ class Tests(object): test.add_cmd("stonith_admin", "-r node_fake -i 1 -v false1") test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true1 -v true2") test.add_cmd("stonith_admin", "-B node_fake -t 5") - test.add_stonith_log_pattern("perform op 'node_fake reboot' with 'false1'") - test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'true2'") - test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on") - test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake") - test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake") - test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") + test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using 'false1'") + test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true2'") + test.add_stonith_log_pattern("Remapped 'off' targeting node_fake complete, remapping to 'on'") + test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) targeting node_fake") + test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) targeting node_fake") + test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") test = 
self.new_test("cpg_remap_complex_2", "Verify remapped topology reboot failure in second level proceeds to third level", @@ -1189,13 +1189,13 @@ class Tests(object): test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true1 -v false2 -v true3") test.add_cmd("stonith_admin", "-r node_fake -i 3 -v true2") test.add_cmd("stonith_admin", "-B node_fake -t 5") - test.add_stonith_log_pattern("perform op 'node_fake reboot' with 'false1'") - test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'true1'") - test.add_stonith_log_pattern("perform op 'node_fake off' with 'false2'") + test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using 'false1'") + test.add_stonith_log_pattern("Remapping multiple-device reboot targeting node_fake") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'true1'") + test.add_stonith_log_pattern("perform 'off' action targeting node_fake using 'false2'") test.add_stonith_log_pattern("Attempted to execute agent fence_dummy (off) the maximum number of times") - test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") - test.add_stonith_log_pattern("perform op 'node_fake reboot' with 'true2'") + test.add_stonith_log_pattern("Undoing remap of reboot targeting node_fake") + test.add_stonith_log_pattern("perform 'reboot' action targeting node_fake using 'true2'") test.add_stonith_neg_log_pattern("node_fake with true3") def setup_environment(self, use_corosync): diff --git a/fencing/remote.c b/fencing/remote.c index 6c5b9b812d7..17df700c105 100644 --- a/fencing/remote.c +++ b/fencing/remote.c @@ -300,7 +300,7 @@ op_requested_action(const remote_fencing_op_t *op) static void op_phase_off(remote_fencing_op_t *op) { - crm_info("Remapping multiple-device reboot of %s (%s) to off", + crm_info("Remapping multiple-device reboot targeting %s (%s) to 'off'", op->target, op->id); op->phase = st_phase_off; @@ -321,7 
+321,8 @@ op_phase_on(remote_fencing_op_t *op) { GListPtr iter = NULL; - crm_info("Remapped off of %s complete, remapping to on for %s.%.8s", + crm_info("Remapped 'off' targeting %s complete, " + "remapping to 'on' for %s.%.8s", op->target, op->client_name, op->id); op->phase = st_phase_on; strcpy(op->action, "on"); @@ -354,7 +355,7 @@ static void undo_op_remap(remote_fencing_op_t *op) { if (op->phase > 0) { - crm_info("Undoing remap of reboot of %s for %s.%.8s", + crm_info("Undoing remap of reboot targeting %s for %s.%.8s", op->target, op->client_name, op->id); op->phase = st_phase_requested; strcpy(op->action, "reboot"); @@ -492,9 +493,12 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) undo_op_remap(op); if (op->notify_sent == TRUE) { - crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s", - op->action, op->target, op->delegate ? op->delegate : "", - op->client_name, op->originator, op->id, op->state, pcmk_strerror(rc)); + crm_err("Already sent notifications for '%s' targeting %s on %s for " + "client %s@%s.%.8s: %s " CRM_XS " rc=%d state=%d", + op->action, op->target, + (op->delegate? op->delegate : "unknown node"), + op->client_name, op->originator, op->id, pcmk_strerror(rc), + rc, op->state); goto remote_op_done_cleanup; } @@ -528,9 +532,10 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) level = LOG_NOTICE; } - do_crm_log(level, - "Operation %s of %s by %s for %s@%s.%.8s: %s", - op->action, op->target, op->delegate ? op->delegate : "", + do_crm_log(level, "Operation '%s'%s%s on %s for %s@%s.%.8s: %s", + op->action, (op->target? " targeting " : ""), + (op->target? op->target : ""), + (op->delegate? 
op->delegate : ""), op->client_name, op->originator, op->id, pcmk_strerror(rc)); handle_local_reply_and_notify(op, data, rc); @@ -577,8 +582,8 @@ remote_op_timeout_one(gpointer userdata) op->op_timer_one = 0; - crm_notice("Peer's fencing (%s) of %s for %s timed out" CRM_XS "id=%s", - op->action, op->target, op->client_name, op->id); + crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS + " id=%s", op->action, op->target, op->client_name, op->id); call_remote_stonith(op, NULL); return FALSE; } @@ -591,13 +596,15 @@ remote_op_timeout(gpointer userdata) op->op_timer_total = 0; if (op->state == st_done) { - crm_debug("Action %s (%s) for %s (%s) already completed", - op->action, op->id, op->target, op->client_name); + crm_debug("Action '%s' targeting %s for client %s already completed " + CRM_XS " id=%s", + op->action, op->target, op->client_name, op->id); return FALSE; } - crm_debug("Action %s (%s) for %s (%s) timed out", - op->action, op->id, op->target, op->client_name); + crm_debug("Action '%s' targeting %s for client %s timed out " + CRM_XS " id=%s", + op->action, op->target, op->client_name, op->id); if (op->phase == st_phase_on) { /* A remapped reboot operation timed out in the "on" phase, but the @@ -622,14 +629,18 @@ remote_op_query_timeout(gpointer data) op->query_timer = 0; if (op->state == st_done) { - crm_debug("Operation %s for %s already completed", op->id, op->target); + crm_debug("Operation %s targeting %s already completed", + op->id, op->target); } else if (op->state == st_exec) { - crm_debug("Operation %s for %s already in progress", op->id, op->target); + crm_debug("Operation %s targeting %s already in progress", + op->id, op->target); } else if (op->query_results) { - crm_debug("Query %s for %s complete: %d", op->id, op->target, op->state); + crm_debug("Query %s targeting %s complete (state=%d)", + op->id, op->target, op->state); call_remote_stonith(op, NULL); } else { - crm_debug("Query %s for %s timed out: %d", op->id, 
op->target, op->state); + crm_debug("Query %s targeting %s timed out (state=%d)", + op->id, op->target, op->state); if (op->op_timer_total) { g_source_remove(op->op_timer_total); op->op_timer_total = 0; @@ -818,7 +829,8 @@ stonith_topology_next(remote_fencing_op_t * op) } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL); if (op->level < ST_LEVEL_MAX) { - crm_trace("Attempting fencing level %d for %s (%d devices) - %s@%s.%.8s", + crm_trace("Attempting fencing level %d targeting %s (%d devices) " + "for client %s@%s.%.8s", op->level, op->target, g_list_length(tp->levels[op->level]), op->client_name, op->originator, op->id); set_op_device_list(op, tp->levels[op->level]); @@ -834,7 +846,7 @@ stonith_topology_next(remote_fencing_op_t * op) return pcmk_ok; } - crm_notice("All fencing options to fence %s for %s@%s.%.8s failed", + crm_notice("All fencing options targeting %s for client %s@%s.%.8s failed", op->target, op->client_name, op->originator, op->id); return -EINVAL; } @@ -877,13 +889,15 @@ merge_duplicates(remote_fencing_op_t * op) peer = crm_get_peer(0, other->originator); if(fencing_peer_active(peer) == FALSE) { - crm_notice("Failing stonith action %s for node %s originating from %s@%s.%.8s: Originator is dead", + crm_notice("Failing action '%s' targeting %s originating from " + "client %s@%s.%.8s: Originator is dead", other->action, other->target, other->client_name, other->originator, other->id); other->state = st_failed; continue; } else if(other->total_timeout > 0 && now > (other->total_timeout + other->created)) { - crm_info("Stonith action %s for node %s originating from %s@%s.%.8s is too old: %d vs. %d + %d", + crm_info("Action '%s' targeting %s originating from client " + "%s@%s.%.8s is too old: %ld vs. 
%ld + %d", other->action, other->target, other->client_name, other->originator, other->id, now, other->created, other->total_timeout); continue; @@ -898,10 +912,11 @@ merge_duplicates(remote_fencing_op_t * op) other->total_timeout = op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL); } - crm_notice - ("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)", - op->action, op->target, op->client_name, op->id, other->client_name, other->originator, - other->id, other->total_timeout); + crm_notice("Merging stonith action '%s' targeting %s originating from " + "client %s.%.8s with identical request from %s@%s.%.8s (%ds)", + op->action, op->target, op->client_name, op->id, + other->client_name, other->originator, other->id, + other->total_timeout); report_timeout_period(op, other->total_timeout); op->state = st_duplicate; } @@ -1014,9 +1029,9 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid)); - crm_trace("%s new stonith op: %s - %s of %s for %s", - (peer - && dev) ? "Recorded" : "Generated", op->id, op->action, op->target, op->client_name); + crm_trace("%s new stonith op %s ('%s' targeting %s for client %s)", + (peer && dev)? 
"Recorded" : "Generated", op->id, op->action, + op->target, op->client_name); if (op->call_options & st_opt_cs_nodeid) { int nodeid = crm_atoi(op->target, NULL); @@ -1079,18 +1094,18 @@ initiate_remote_stonith_op(crm_client_t * client, xmlNode * request, gboolean ma switch (op->state) { case st_failed: - crm_warn("Could not request peer fencing (%s) of %s " + crm_warn("Could not request peer fencing (%s) targeting %s " CRM_XS " id=%s", op->action, op->target, op->id); remote_op_done(op, NULL, -EINVAL, FALSE); return op; case st_duplicate: - crm_info("Requesting peer fencing (%s) of %s (duplicate) " + crm_info("Requesting peer fencing (%s) targeting %s (duplicate) " CRM_XS " id=%s", op->action, op->target, op->id); return op; default: - crm_notice("Requesting peer fencing (%s) of %s " + crm_notice("Requesting peer fencing (%s) targeting %s " CRM_XS " id=%s state=%d", op->action, op->target, op->id, op->state); } @@ -1137,7 +1152,7 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer for (iter = op->query_results; iter != NULL; iter = iter->next) { st_query_result_t *peer = iter->data; - crm_trace("Testing result from %s for %s with %d devices: %d %x", + crm_trace("Testing result from %s targeting %s with %d devices: %d %x", peer->host, op->target, peer->ndevices, peer->tried, options); if ((options & FIND_PEER_SKIP_TARGET) && safe_str_eq(peer->host, op->target)) { continue; @@ -1423,12 +1438,13 @@ advance_op_topology(remote_fencing_op_t *op, const char *device, xmlNode *msg, if (op->devices) { /* Necessary devices remain, so execute the next one */ - crm_trace("Next for %s on behalf of %s@%s (rc was %d)", + crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)", op->target, op->originator, op->client_name, rc); call_remote_stonith(op, NULL); } else { /* We're done with all devices and phases, so finalize operation */ - crm_trace("Marking complex fencing op for %s as complete", op->target); + crm_trace("Marking complex fencing op 
targeting %s as complete", + op->target); op->state = st_done; remote_op_done(op, msg, rc, FALSE); } @@ -1451,7 +1467,7 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout; op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op); report_timeout_period(op, op->total_timeout); - crm_info("Total timeout set to %d for peer's fencing of %s for %s" + crm_info("Total timeout set to %d for peer's fencing targeting %s for %s" CRM_XS "id=%s", total_timeout, op->target, op->client_name, op->id); } @@ -1482,17 +1498,20 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) if (device) { timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_device_timeout(op, peer, device); - crm_info("Requesting that '%s' perform op '%s %s' with '%s' for %s (%ds)", peer->host, - op->target, op->action, device, op->client_name, timeout_one); + crm_notice("Requesting that %s perform '%s' action targeting %s " + "using '%s' " CRM_XS " for client %s (%ds)", + peer->host, op->action, op->target, device, + op->client_name, timeout_one); crm_xml_add(remote_op, F_STONITH_DEVICE, device); crm_xml_add(remote_op, F_STONITH_MODE, "slave"); } else { timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer); - crm_info("Requesting that '%s' perform op '%s %s' for %s (%ds, %ds)", - peer->host, op->target, op->action, op->client_name, timeout_one, stonith_watchdog_timeout_ms); + crm_notice("Requesting that %s perform '%s' action targeting %s " + CRM_XS " for client %s (%ds, %lds)", + peer->host, op->action, op->target, op->client_name, + timeout_one, stonith_watchdog_timeout_ms); crm_xml_add(remote_op, F_STONITH_MODE, "smart"); - } op->state = st_exec; @@ -1501,18 +1520,18 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) } if(stonith_watchdog_timeout_ms > 0 && device && safe_str_eq(device, "watchdog")) { - crm_notice("Waiting %ds for %s to self-fence (%s) 
for %s.%.8s (%p)", - stonith_watchdog_timeout_ms/1000, op->target, - op->action, op->client_name, op->id, device); + crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s", + stonith_watchdog_timeout_ms/1000, op->target, op->action, + op->client_name, op->id); op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); /* TODO check devices to verify watchdog will be in use */ } else if(stonith_watchdog_timeout_ms > 0 && safe_str_eq(peer->host, op->target) && safe_str_neq(op->action, "on")) { - crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)", - stonith_watchdog_timeout_ms/1000, op->target, - op->action, op->client_name, op->id, device); + crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s", + stonith_watchdog_timeout_ms/1000, op->target, op->action, + op->client_name, op->id); op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); } else { @@ -1529,19 +1548,20 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) /* A remapped "on" cannot be executed, but the node was already * turned off successfully, so ignore the error and continue. 
*/ - crm_warn("Ignoring %s 'on' failure (no capable peers) for %s after successful 'off'", - device, op->target); + crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s " + "after successful 'off'", device, op->target); advance_op_topology(op, device, NULL, pcmk_ok); return; } else if (op->owner == FALSE) { - crm_err("Fencing (%s) of %s for %s is not ours to control", + crm_err("Fencing (%s) targeting %s for client %s is not ours to control", op->action, op->target, op->client_name); } else if (op->query_timer == 0) { /* We've exhausted all available peers */ - crm_info("No remaining peers capable of fencing (%s) %s for %s (%d)", - op->target, op->action, op->client_name, op->state); + crm_info("No remaining peers capable of fencing (%s) %s for client %s " + CRM_XS " state=%d", + op->action, op->target, op->client_name, op->state); CRM_LOG_ASSERT(op->state < st_done); remote_op_timeout(op); @@ -1553,35 +1573,36 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) * are available to execute the fencing operation. 
*/ if(stonith_watchdog_timeout_ms && (device == NULL || safe_str_eq(device, "watchdog"))) { - crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)", + crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s", stonith_watchdog_timeout_ms/1000, op->target, - op->action, op->client_name, op->id, device); + op->action, op->client_name, op->id); op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); return; } if (op->state == st_query) { - crm_info("No peers (out of %d) have devices capable of fencing (%s) %s for %s (%d)", - op->replies, op->action, op->target, op->client_name, - op->state); + crm_info("No peers (out of %d) have devices capable of fencing " + "(%s) %s for client %s " CRM_XS " state=%d", + op->replies, op->action, op->target, op->client_name, + op->state); rc = -ENODEV; } else { - crm_info("No peers (out of %d) are capable of fencing (%s) %s for %s (%d)", - op->replies, op->action, op->target, op->client_name, - op->state); + crm_info("No peers (out of %d) are capable of fencing (%s) %s " + "for client %s " CRM_XS " state=%d", + op->replies, op->action, op->target, op->client_name, + op->state); } op->state = st_failed; remote_op_done(op, NULL, rc, FALSE); - } else if (device) { - crm_info("Waiting for additional peers capable of fencing (%s) %s with %s for %s.%.8s", - op->action, op->target, device, op->client_name, op->id); } else { - crm_info("Waiting for additional peers capable of fencing (%s) %s for %s%.8s", - op->action, op->target, op->client_name, op->id); + crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " + "for client %s%.8s", + op->action, op->target, (device? " with " : ""), + (device? 
device : ""), op->client_name, op->id); } } @@ -1974,9 +1995,10 @@ process_remote_stonith_exec(xmlNode * msg) if (is_set(op->call_options, st_opt_topology)) { const char *device = crm_element_value(msg, F_STONITH_DEVICE); - crm_notice("Call to %s for '%s %s' on behalf of %s@%s: %s (%d)", - device, op->target, op->action, op->client_name, op->originator, - pcmk_strerror(rc), rc); + crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s " + CRM_XS " rc=%d", + op->action, op->target, device, op->client_name, + op->originator, pcmk_strerror(rc), rc); /* We own the op, and it is complete. broadcast the result to all nodes * and notify our local clients. */ @@ -1989,8 +2011,8 @@ process_remote_stonith_exec(xmlNode * msg) /* A remapped "on" failed, but the node was already turned off * successfully, so ignore the error and continue. */ - crm_warn("Ignoring %s 'on' failure (exit code %d) for %s after successful 'off'", - device, rc, op->target); + crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s " + "after successful 'off'", device, rc, op->target); rc = pcmk_ok; } diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h index a350de292c5..0d225f5d849 100644 --- a/include/crm/common/internal.h +++ b/include/crm/common/internal.h @@ -23,6 +23,7 @@ #include /* for gboolean */ #include /* for struct dirent */ #include /* for getpid() */ +#include /* for bool */ #include /* for uid_t and gid_t */ #include @@ -42,6 +43,8 @@ char *crm_read_contents(const char *filename); int crm_write_sync(int fd, const char *contents); int crm_set_nonblocking(int fd); +void pcmk__close_fds_in_child(bool); + /* internal procfs utilities (from procfs.c) */ diff --git a/lib/common/Makefile.am b/lib/common/Makefile.am index bb755cc0a1c..a5e56c5348a 100644 --- a/lib/common/Makefile.am +++ b/lib/common/Makefile.am @@ -34,7 +34,7 @@ CFLAGS = $(CFLAGS_COPY:-Wcast-qual=) -fPIC noinst_HEADERS = crmcommon_private.h -libcrmcommon_la_LDFLAGS = -version-info 
14:0:11 +libcrmcommon_la_LDFLAGS = -version-info 15:0:12 libcrmcommon_la_CFLAGS = $(CFLAGS_HARDENED_LIB) libcrmcommon_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) diff --git a/lib/common/io.c b/lib/common/io.c index e296ff3ff3f..b5ba0ae5d11 100644 --- a/lib/common/io.c +++ b/lib/common/io.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -518,3 +519,74 @@ crm_set_nonblocking(int fd) } return pcmk_ok; } + +/*! + * \internal + * \brief Close open file descriptors + * + * Close all file descriptors (except optionally stdin, stdout, and stderr), + * which is a best practice for a new child process forked for the purpose of + * executing an external program. + * + * \param[in] all If true, close stdin, stdout, and stderr as well + */ +void +pcmk__close_fds_in_child(bool all) +{ + DIR *dir; + struct rlimit rlim; + rlim_t max_fd; + int min_fd = (all? 0 : (STDERR_FILENO + 1)); + + /* Find the current process's (soft) limit for open files. getrlimit() + * should always work, but have a fallback just in case. + */ + if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) { + max_fd = rlim.rlim_cur - 1; + } else { + long conf_max = sysconf(_SC_OPEN_MAX); + + max_fd = (conf_max > 0)? conf_max : 1024; + } + + /* /proc/self/fd (on Linux) or /dev/fd (on most OSes) contains symlinks to + * all open files for the current process, named as the file descriptor. + * Use this if available, because it's more efficient than a shotgun + * approach to closing descriptors. + */ +#if SUPPORT_PROCFS + dir = opendir("/proc/self/fd"); + if (dir == NULL) { + dir = opendir("/dev/fd"); + } +#else + dir = opendir("/dev/fd"); +#endif + if (dir != NULL) { + struct dirent *entry; + int dir_fd = dirfd(dir); + + while ((entry = readdir(dir)) != NULL) { + int lpc = atoi(entry->d_name); + + /* How could one of these entries be higher than max_fd, you ask? 
+ * It isn't possible in normal operation, but when run under + * valgrind, valgrind can open high-numbered file descriptors for + * its own use that are higher than the process's soft limit. + * These will show up in the fd directory but aren't closable. + */ + if ((lpc >= min_fd) && (lpc <= max_fd) && (lpc != dir_fd)) { + close(lpc); + } + } + closedir(dir); + return; + } + + /* If no fd directory is available, iterate over all possible descriptors. + * This is less efficient due to the overhead of many system calls. + */ + for (int lpc = max_fd; lpc >= min_fd; lpc--) { + close(lpc); + } +} diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am index abc2d50253d..43c882ce371 100644 --- a/lib/fencing/Makefile.am +++ b/lib/fencing/Makefile.am @@ -9,7 +9,7 @@ include $(top_srcdir)/Makefile.common lib_LTLIBRARIES = libstonithd.la -libstonithd_la_LDFLAGS = -version-info 6:1:4 +libstonithd_la_LDFLAGS = -version-info 7:0:5 libstonithd_la_CFLAGS = $(CFLAGS_HARDENED_LIB) libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) diff --git a/lib/lrmd/Makefile.am b/lib/lrmd/Makefile.am index 3468da0036a..4e7a5e5d0d8 100644 --- a/lib/lrmd/Makefile.am +++ b/lib/lrmd/Makefile.am @@ -18,7 +18,7 @@ include $(top_srcdir)/Makefile.common lib_LTLIBRARIES = liblrmd.la -liblrmd_la_LDFLAGS = -version-info 6:2:5 +liblrmd_la_LDFLAGS = -version-info 6:3:5 liblrmd_la_CFLAGS = $(CFLAGS_HARDENED_LIB) liblrmd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) diff --git a/lib/pengine/Makefile.am b/lib/pengine/Makefile.am index d565711cf04..aa1f555b887 100644 --- a/lib/pengine/Makefile.am +++ b/lib/pengine/Makefile.am @@ -23,7 +23,7 @@ lib_LTLIBRARIES = libpe_rules.la libpe_status.la ## SOURCES noinst_HEADERS = unpack.h variant.h pe_status_private.h -libpe_rules_la_LDFLAGS = -version-info 5:2:3 +libpe_rules_la_LDFLAGS = -version-info 5:3:3 libpe_rules_la_CFLAGS = $(CFLAGS_HARDENED_LIB) libpe_rules_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) @@ -31,7 +31,7 @@ libpe_rules_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) 
libpe_rules_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la libpe_rules_la_SOURCES = rules.c rules_alerts.c common.c -libpe_status_la_LDFLAGS = -version-info 16:0:0 +libpe_status_la_LDFLAGS = -version-info 16:1:0 libpe_status_la_CFLAGS = $(CFLAGS_HARDENED_LIB) libpe_status_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) diff --git a/lib/services/Makefile.am b/lib/services/Makefile.am index 4add340e306..4b0c43cc8c5 100644 --- a/lib/services/Makefile.am +++ b/lib/services/Makefile.am @@ -23,7 +23,7 @@ AM_CPPFLAGS = -I$(top_srcdir)/include lib_LTLIBRARIES = libcrmservice.la noinst_HEADERS = pcmk-dbus.h upstart.h systemd.h services_private.h -libcrmservice_la_LDFLAGS = -version-info 7:0:4 +libcrmservice_la_LDFLAGS = -version-info 8:0:5 libcrmservice_la_CPPFLAGS = -DOCF_ROOT_DIR=\"@OCF_ROOT_DIR@\" $(AM_CPPFLAGS) libcrmservice_la_CFLAGS = $(GIO_CFLAGS) diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c index 5b5feea54fc..65a6a7b8cbb 100644 --- a/lib/services/services_linux.c +++ b/lib/services/services_linux.c @@ -443,8 +443,6 @@ services_handle_exec_error(svc_action_t * op, int error) static void action_launch_child(svc_action_t *op) { - int lpc; - /* SIGPIPE is ignored (which is different from signal blocking) by the gnutls library. * Depending on the libqb version in use, libqb may set SIGPIPE to be ignored as well. * We do not want this to be inherited by the child process. 
By resetting this the signal @@ -474,10 +472,7 @@ action_launch_child(svc_action_t *op) */ setpgid(0, 0); - /* close all descriptors except stdin/out/err and channels to logd */ - for (lpc = getdtablesize() - 1; lpc > STDERR_FILENO; lpc--) { - close(lpc); - } + pcmk__close_fds_in_child(false); #if SUPPORT_CIBSECRETS if (replace_secret_params(op->rsc, op->params) < 0) { diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c index ddfa3c76b6b..05515c83217 100644 --- a/mcp/pacemaker.c +++ b/mcp/pacemaker.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -303,10 +305,8 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL }; static gboolean start_child(pcmk_child_t * child) { - int lpc = 0; uid_t uid = 0; gid_t gid = 0; - struct rlimit oflimits; gboolean use_valgrind = FALSE; gboolean use_callgrind = FALSE; const char *devnull = "/dev/null"; @@ -403,11 +403,7 @@ start_child(pcmk_child_t * child) crm_perror(LOG_ERR, "Could not set user to %d (%s)", uid, child->uid); } - /* Close all open file descriptors */ - getrlimit(RLIMIT_NOFILE, &oflimits); - for (lpc = 0; lpc < oflimits.rlim_cur; lpc++) { - close(lpc); - } + pcmk__close_fds_in_child(true); (void)open(devnull, O_RDONLY); /* Stdin: fd 0 */ (void)open(devnull, O_WRONLY); /* Stdout: fd 1 */ diff --git a/pengine/Makefile.am b/pengine/Makefile.am index 7fbe2852397..b17232a1fd0 100644 --- a/pengine/Makefile.am +++ b/pengine/Makefile.am @@ -59,7 +59,7 @@ endif noinst_HEADERS = allocate.h notif.h utils.h pengine.h -libpengine_la_LDFLAGS = -version-info 16:1:0 +libpengine_la_LDFLAGS = -version-info 16:2:0 libpengine_la_CFLAGS = $(CFLAGS_HARDENED_LIB) libpengine_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) diff --git a/rpm/pacemaker.spec.in b/rpm/pacemaker.spec.in index 9a581fcafa9..dfcb2700547 100644 --- a/rpm/pacemaker.spec.in +++ b/rpm/pacemaker.spec.in @@ -17,7 +17,7 @@ ## Upstream pacemaker version, and its package version (specversion ## can be incremented to build packages 
reliably considered "newer" ## than previously built packages with the same pcmkversion) -%global pcmkversion 1.1.21 +%global pcmkversion 1.1.22 %global specversion 1 ## Upstream commit (or git tag, such as "Pacemaker-" plus the diff --git a/tools/crm_mon.c b/tools/crm_mon.c index ee9a3ef642b..65c9c765fb3 100644 --- a/tools/crm_mon.c +++ b/tools/crm_mon.c @@ -3069,57 +3069,40 @@ print_failed_actions(FILE *stream, pe_working_set_t *data_set) static stonith_history_t * reduce_stonith_history(stonith_history_t *history) { - stonith_history_t *new = NULL, *hp, *np, *tmp; + stonith_history_t *new = history, *hp, *np; - for (hp = history; hp; ) { - for (np = new; np; np = np->next) { - if ((hp->state == st_done) || (hp->state == st_failed)) { - /* action not in progress */ - if (safe_str_eq(hp->target, np->target) && - safe_str_eq(hp->action, np->action) && - (hp->state == np->state)) { - if ((hp->state == st_done) || - safe_str_eq(hp->delegate, np->delegate)) { - /* replace or purge */ - if (hp->completed < np->completed) { + if (new) { + hp = new->next; + new->next = NULL; + + while (hp) { + stonith_history_t *hp_next = hp->next; + + hp->next = NULL; + + for (np = new; ; np = np->next) { + if ((hp->state == st_done) || (hp->state == st_failed)) { + /* action not in progress */ + if (safe_str_eq(hp->target, np->target) && + safe_str_eq(hp->action, np->action) && + (hp->state == np->state) && + ((hp->state == st_done) || + safe_str_eq(hp->delegate, np->delegate))) { /* purge older hp */ - tmp = hp->next; - hp->next = NULL; stonith_history_free(hp); - hp = tmp; break; - } - /* damn single linked list */ - free(hp->target); - free(hp->action); - free(np->origin); - np->origin = hp->origin; - free(np->delegate); - np->delegate = hp->delegate; - free(np->client); - np->client = hp->client; - np->completed = hp->completed; - tmp = hp; - hp = hp->next; - free(tmp); - break; } } - if (np->next) { - continue; + + if (!np->next) { + np->next = hp; + break; } } - np = 0; /* let 
outer loop progress hp */ - break; - } - /* simply move hp from history to new */ - if (np == NULL) { - tmp = hp->next; - hp->next = new; - new = hp; - hp = tmp; + hp = hp_next; } } + return new; } @@ -3207,8 +3190,31 @@ fence_action_str(const char *action) * \param[in] stream File stream to display output to * \param[in] event stonith event */ +static gboolean +is_later_succeeded(stonith_history_t *event, stonith_history_t *top_history) +{ + + gboolean ret = FALSE; + + for (stonith_history_t *prev_hp = top_history; prev_hp; prev_hp = prev_hp->next) { + if (prev_hp == event) { + break; + } + + if ((prev_hp->state == st_done) && + safe_str_eq(event->target, prev_hp->target) && + safe_str_eq(event->action, prev_hp->action) && + safe_str_eq(event->delegate, prev_hp->delegate) && + (event->completed < prev_hp->completed)) { + ret = TRUE; + break; + } + } + return ret; +} + static void -print_stonith_action(FILE *stream, stonith_history_t *event) +print_stonith_action(FILE *stream, stonith_history_t *event, stonith_history_t *top_history) { char *action_s = fence_action_str(event->action); time_t completed = event->completed; @@ -3262,12 +3268,13 @@ print_stonith_action(FILE *stream, stonith_history_t *event) break; case st_failed: print_as("* %s of %s failed: delegate=%s, client=%s, origin=%s,\n" - " %s='%s'\n", + " %s='%s' %s\n", action_s, event->target, event->delegate ? event->delegate : "", event->client, event->origin, fence_full_history?"completed":"last-failed", - run_at_s?run_at_s:""); + run_at_s?run_at_s:"", + is_later_succeeded(event, top_history) ? "(a later attempt succeeded)" : ""); break; default: print_as("* %s of %s pending: client=%s, origin=%s\n", @@ -3290,12 +3297,13 @@ print_stonith_action(FILE *stream, stonith_history_t *event) break; case st_failed: fprintf(stream, "
  • %s of %s failed: delegate=%s, " - "client=%s, origin=%s, %s='%s'
  • \n", + "client=%s, origin=%s, %s='%s' %s\n", action_s, event->target, event->delegate ? event->delegate : "", event->client, event->origin, fence_full_history?"completed":"last-failed", - run_at_s?run_at_s:""); + run_at_s?run_at_s:"", + is_later_succeeded(event, top_history) ? "(a later attempt succeeded)" : ""); break; default: fprintf(stream, "
  • %s of %s pending: client=%s, " @@ -3357,7 +3365,7 @@ print_failed_stonith_actions(FILE *stream, stonith_history_t *history) /* Print each failed stonith action */ for (hp = history; hp; hp = hp->next) { if (hp->state == st_failed) { - print_stonith_action(stream, hp); + print_stonith_action(stream, hp, history); } } @@ -3412,7 +3420,7 @@ print_stonith_pending(FILE *stream, stonith_history_t *history) if ((hp->state == st_failed) || (hp->state == st_done)) { break; } - print_stonith_action(stream, hp); + print_stonith_action(stream, hp, NULL); } /* End section */ @@ -3463,7 +3471,7 @@ print_stonith_history(FILE *stream, stonith_history_t *history) for (hp = history; hp; hp = hp->next) { if ((hp->state != st_failed) || (output_format == mon_output_xml)) { - print_stonith_action(stream, hp); + print_stonith_action(stream, hp, NULL); } } @@ -4753,10 +4761,10 @@ mon_refresh_display(gpointer user_data) fprintf(stderr, "Critical: Unable to get stonith-history\n"); mon_cib_connection_destroy(NULL); } else { + stonith_history = sort_stonith_history(stonith_history); if ((!fence_full_history) && (output_format != mon_output_xml)) { stonith_history = reduce_stonith_history(stonith_history); } - stonith_history = sort_stonith_history(stonith_history); break; /* all other cases are errors */ } } else { diff --git a/tools/crm_report.in b/tools/crm_report.in index 0ef4e6f526d..4fc2060fcac 100755 --- a/tools/crm_report.in +++ b/tools/crm_report.in @@ -20,7 +20,7 @@ TEMP=`getopt \ -o hv?xl:f:t:n:T:L:p:c:dSACHu:D:MVse: \ - --long help,corosync,cts:,cts-log:,dest:,heartbeat,node:,nodes:,--openais,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \ + --long help,corosync,cts:,cts-log:,dest:,heartbeat,node:,nodes:,openais,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \ -n 'crm_report' -- "$@"` # The quotes around $TEMP are essential eval set -- "$TEMP" diff --git a/version.m4 
b/version.m4 index 2ce1122a81a..20845dfff51 100644 --- a/version.m4 +++ b/version.m4 @@ -1,2 +1,2 @@ -m4_define([VERSION_NUMBER], [1.1.21]) +m4_define([VERSION_NUMBER], [1.1.22]) m4_define([PCMK_URL], [https://clusterlabs.org/])