fix: Warn when signal is handled (#5186)
Cloud-init is expected to run to completion. It does not expect to be
stopped or restarted. Therefore, cloud-init should warn loudly when this
happens so that the user is aware that something unexpected has happened.

Problems:

- When systemd stops cloud-init, no warning is logged.
- When systemd restarts cloud-init, the status info and return code from
  the current stage are lost.

The net effect of these problems is that when cloud-init is restarted,
no warning is logged and `cloud-init status` indicates no problem.

Communicate this unexpected state by:

- Increasing the signal handling log level to ERROR and also logging to
  stderr.
- Writing status.json before exiting when sys.exit() is called.
- Warning when status.json already contains data for a stage when that
  stage starts[1].
- Appending new errors and recoverable errors to the existing ones in
  status.json rather than overwriting them[2].

These changes should make cloud-init properly warn, in both status
output and logs, that something is amiss.

[1] This should never happen and indicates that something restarted the
    service. When cloud-init doesn't get the opportunity to handle a
    signal (e.g. SIGKILL), the other mitigations will fail.
[2] Pre-existing warnings were previously silently lost from
    `cloud-init status`'s output.

Fixes GH-5190
holmanb authored May 5, 2024
1 parent 51c6569 commit f24ecef
Showing 4 changed files with 220 additions and 44 deletions.
117 changes: 79 additions & 38 deletions cloudinit/cmd/main.py
@@ -56,6 +56,14 @@
"once": PER_ONCE,
}

# https://cloudinit.readthedocs.io/en/latest/explanation/boot.html
STAGE_NAME = {
"init-local": "Local Stage",
"init": "Network Stage",
"modules-config": "Config Stage",
"modules-final": "Final Stage",
}

LOG = logging.getLogger(__name__)


@@ -737,18 +745,26 @@ def status_wrapper(name, args):
     else:
         raise ValueError("unknown name: %s" % name)
 
-    modes = (
-        "init",
-        "init-local",
-        "modules-config",
-        "modules-final",
-    )
-    if mode not in modes:
+    if mode not in STAGE_NAME:
         raise ValueError(
             "Invalid cloud init mode specified '{0}'".format(mode)
         )
 
-    status = None
+    nullstatus = {
+        "errors": [],
+        "recoverable_errors": {},
+        "start": None,
+        "finished": None,
+    }
+    status = {
+        "v1": {
+            "datasource": None,
+            "init": nullstatus.copy(),
+            "init-local": nullstatus.copy(),
+            "modules-config": nullstatus.copy(),
+            "modules-final": nullstatus.copy(),
+        }
+    }
     if mode == "init-local":
         for f in (status_link, result_link, status_path, result_path):
             util.del_file(f)
@@ -758,25 +774,21 @@ def status_wrapper(name, args):
         except Exception:
             pass
 
-    nullstatus = {
-        "errors": [],
-        "start": None,
-        "finished": None,
-    }
-
-    if status is None:
-        status = {"v1": {}}
-        status["v1"]["datasource"] = None
-
-    for m in modes:
-        if m not in status["v1"]:
-            status["v1"][m] = nullstatus.copy()
+    if mode not in status["v1"]:
+        # this should never happen, but leave it just to be safe
+        status["v1"][mode] = nullstatus.copy()
 
     v1 = status["v1"]
     v1["stage"] = mode
-    uptime = util.uptime()
-    v1[mode]["start"] = float(uptime)
-    v1[mode]["recoverable_errors"] = next(
+    if v1[mode]["start"] and not v1[mode]["finished"]:
+        # This stage was restarted, which isn't expected.
+        LOG.warning(
+            "Unexpected start time found for %s. Was this stage restarted?",
+            STAGE_NAME[mode],
+        )
+
+    v1[mode]["start"] = float(util.uptime())
+    preexisting_recoverable_errors = next(
         filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers)
     ).export_logs()
 
@@ -795,27 +807,56 @@ def status_wrapper(name, args):
         else:
             errors = ret
 
-        v1[mode]["errors"] = [str(e) for e in errors]
-
+        v1[mode]["errors"].extend([str(e) for e in errors])
     except Exception as e:
-        util.logexc(LOG, "failed stage %s", mode)
+        LOG.exception("failed stage %s", mode)
         print_exc("failed run of stage %s" % mode)
-        v1[mode]["errors"] = [str(e)]
-
-    v1[mode]["finished"] = float(util.uptime())
-    v1["stage"] = None
+        v1[mode]["errors"].append(str(e))
+    except SystemExit as e:
+        # All calls to sys.exit() resume running here.
+        # silence a pylint false positive
+        # https://github.com/pylint-dev/pylint/issues/9556
+        if e.code:  # pylint: disable=using-constant-test
+            # Only log errors when sys.exit() is called with a non-zero
+            # exit code
+            LOG.exception("failed stage %s", mode)
+            print_exc("failed run of stage %s" % mode)
+            v1[mode]["errors"].append(f"sys.exit({str(e.code)}) called")
+    finally:
+        # Before it exits, cloud-init will:
+        # 1) Write status.json (and result.json if in Final stage).
+        # 2) Write the final log message containing module run time.
+        # 3) Flush any queued reporting event handlers.
+        v1[mode]["finished"] = float(util.uptime())
+        v1["stage"] = None
+
+        # merge new recoverable errors into existing recoverable error list
+        new_recoverable_errors = next(
+            filter(
+                lambda h: isinstance(h, log.LogExporter), root_logger.handlers
+            )
+        ).export_logs()
+        for key in new_recoverable_errors.keys():
+            if key in preexisting_recoverable_errors:
+                v1[mode]["recoverable_errors"][key] = list(
+                    set(
+                        preexisting_recoverable_errors[key]
+                        + new_recoverable_errors[key]
+                    )
+                )
+            else:
+                v1[mode]["recoverable_errors"][key] = new_recoverable_errors[
+                    key
+                ]
 
-    # Write status.json after running init / module code
-    v1[mode]["recoverable_errors"] = next(
-        filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers)
-    ).export_logs()
-    atomic_helper.write_json(status_path, status)
+        # Write status.json after running init / module code
+        atomic_helper.write_json(status_path, status)
 
     if mode == "modules-final":
         # write the 'finished' file
         errors = []
-        for m in modes:
-            if v1[m]["errors"]:
+        for m in v1.keys():
+            if isinstance(v1[m], dict) and v1[m].get("errors"):
             errors.extend(v1[m].get("errors", []))
 
         atomic_helper.write_json(
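
The merge rule in the finally block above unions the per-level lists of
recoverable errors captured before and after the stage runs, so warnings
recorded by an earlier (interrupted) run survive in status.json. A
standalone sketch of that rule (merge_recoverable_errors is an
illustrative name, not cloud-init's API):

    def merge_recoverable_errors(preexisting: dict, new: dict) -> dict:
        """Union per-level error lists instead of overwriting them."""
        merged = {}
        for level, messages in new.items():
            if level in preexisting:
                # set() drops messages duplicated across a restart
                merged[level] = list(set(preexisting[level] + messages))
            else:
                merged[level] = messages
        return merged

    # a WARNING captured before a restart now survives in status output
    merged = merge_recoverable_errors(
        {"WARNING": ["old warning"]},
        {"WARNING": ["new warning"]},
    )
    assert sorted(merged["WARNING"]) == ["new warning", "old warning"]
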
2 changes: 1 addition & 1 deletion cloudinit/signal_handler.py
@@ -44,7 +44,7 @@ def _handle_exit(signum, frame):
     contents = StringIO()
     contents.write("%s\n" % (msg))
     _pprint_frame(frame, 1, BACK_FRAME_TRACE_DEPTH, contents)
-    util.multi_log(contents.getvalue(), console=True, stderr=False, log=LOG)
+    util.multi_log(contents.getvalue(), log=LOG, log_level=logging.ERROR)
     sys.exit(rc)


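The one-line change raises the handler's log level to ERROR and, by
dropping stderr=False, falls back to multi_log's default of also
writing to stderr. A self-contained approximation of the resulting
behavior (not cloud-init's actual helper; the message text and exit
code convention are illustrative):

    import logging
    import signal
    import sys

    LOG = logging.getLogger("cloudinit")

    def _handle_exit(signum, frame):
        # log at ERROR and echo to stderr so the termination is visible
        # in both the log file and on the console
        msg = "Cloud-init received %s, exiting" % signal.Signals(signum).name
        LOG.error(msg)
        print(msg, file=sys.stderr)
        # 128 + signum is a common shell convention, illustrative only
        sys.exit(128 + signum)

    signal.signal(signal.SIGTERM, _handle_exit)
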
4 changes: 4 additions & 0 deletions tests/integration_tests/util.py
@@ -184,6 +184,10 @@ def verify_clean_log(log: str, ignore_deprecations: bool = True):
         raise AssertionError(
             "Found unexpected errors: %s" % "\n".join(error_logs)
         )
+    if re.findall("Cloud-init.*received SIG", log):
+        raise AssertionError(
+            "Found unexpected signal termination: %s" % "\n".join(error_logs)
+        )
 
     warning_count = log.count("[WARNING]")
     expected_warnings = 0
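
For reference, a hypothetical log line that would now trip this check
(the exact message wording is illustrative, not copied from cloud-init):

    import re

    log = "2024-05-05 12:00:00,000 [ERROR]: Cloud-init received SIGTERM, exiting...\n"
    assert re.findall("Cloud-init.*received SIG", log)
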
141 changes: 136 additions & 5 deletions tests/unittests/test_cli.py
@@ -2,8 +2,10 @@
 
 import contextlib
 import io
+import json
 import logging
 import os
+import sys
 from collections import namedtuple
 
 import pytest
Expand All @@ -16,6 +18,7 @@

M_PATH = "cloudinit.cmd.main."
Tmpdir = namedtuple("Tmpdir", ["tmpdir", "link_d", "data_d"])
FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])


@pytest.fixture()
@@ -76,7 +79,6 @@ def _call_main(self, sysv_args=None):
     def test_status_wrapper_errors(
         self, action, name, match, caplog, mock_status_wrapper
     ):
-        FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])
         my_action = mock.Mock()
 
         myargs = FakeArgs((action, my_action), False, "bogusmode")
@@ -102,8 +104,6 @@ def test_status_wrapper_init_local_writes_fresh_status_info(
             str(_dir), {"status.json": "old", "result.json": "old"}
         )
 
-        FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])
-
         def myaction(name, args):
             # Return an error to watch status capture them
             return "SomeDatasource", ["an error"]
@@ -140,8 +140,6 @@ def test_status_wrapper_init_local_honor_cloud_dir(
         data_d = mock_status_wrapper.data_d
         link_d = mock_status_wrapper.link_d
 
-        FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])
-
        def myaction(name, args):
             # Return an error to watch status capture them
             return "SomeDatasource", ["an_error"]
@@ -414,3 +412,136 @@ def test_features_hook_subcommand(self, m_features):
         assert "features" == parseargs.action[0]
         assert False is parseargs.debug
         assert False is parseargs.force
+
+
+class TestSignalHandling:
+    @mock.patch("cloudinit.cmd.main.atomic_helper.write_json")
+    def test_status_wrapper_signal_sys_exit(
+        self,
+        m_json,
+        mocker,
+        mock_status_wrapper,
+    ):
+        """make sure that when sys.exit(N) is called, the correct code is
+        returned
+        """
+        for code in [1, 2, 3, 4]:
+            rc = cli.status_wrapper(
+                "init",
+                FakeArgs(
+                    (
+                        None,
+                        # silence pylint false positive
+                        # https://github.com/pylint-dev/pylint/issues/9557
+                        lambda *_: sys.exit(code),  # pylint: disable=W0640
+                    ),
+                    False,
+                    "bogusmode",
+                ),
+            )
+            assert 1 == rc
+
+            # assert that the status shows errors
+            assert (
+                f"sys.exit({code}) called"
+                in m_json.call_args[0][1]["v1"]["init"]["errors"]
+            )
+
+    @mock.patch("cloudinit.cmd.main.atomic_helper.write_json")
+    def test_status_wrapper_no_signal_sys_exit(
+        self,
+        m_json,
+        mock_status_wrapper,
+    ):
+        """if sys.exit(0) is called, make sure that cloud-init doesn't log a
+        warning"""
+        # call status_wrapper() with the required args
+        rc = cli.status_wrapper(
+            "init",
+            FakeArgs(
+                (
+                    None,
+                    lambda *_: sys.exit(0),
+                ),
+                False,
+                "bogusmode",
+            ),
+        )
+        assert 0 == rc
+        assert not m_json.call_args[0][1]["v1"]["init"]["errors"]
+
+    @mock.patch("cloudinit.cmd.main.atomic_helper.write_json")
+    def test_status_wrapper_signal_warnings(
+        self,
+        m_json,
+        mock_status_wrapper,
+    ):
+        """If a stage is started and status.json already has a start time but
+        no end time for that stage, this is an unknown state - make sure that
+        a warning is logged.
+        """
+
+        # Write a status.json to the mocked temporary directory
+        for dir in mock_status_wrapper.data_d, mock_status_wrapper.link_d:
+            test_helpers.populate_dir(
+                str(dir),
+                {
+                    "status.json": json.dumps(
+                        {
+                            "v1": {
+                                "stage": "init",
+                                "datasource": (
+                                    "DataSourceNoCloud "
+                                    "[seed=/var/.../seed/nocloud-net]"
+                                    "[dsmode=net]"
+                                ),
+                                "init": {
+                                    "errors": [],
+                                    "recoverable_errors": {},
+                                    "start": 124.567,
+                                    "finished": None,
+                                },
+                                "init-local": {
+                                    "errors": [],
+                                    "recoverable_errors": {},
+                                    "start": 100.0,
+                                    "finished": 100.00001,
+                                },
+                                "modules-config": {
+                                    "errors": [],
+                                    "recoverable_errors": {},
+                                    "start": None,
+                                    "finished": None,
+                                },
+                                "modules-final": {
+                                    "errors": [],
+                                    "recoverable_errors": {},
+                                    "start": None,
+                                    "finished": None,
+                                },
+                            }
+                        }
+                    )
+                },
+            )
+        # call status_wrapper() with the required args
+        cli.status_wrapper(
+            "init",
+            FakeArgs(
+                (
+                    None,
+                    lambda *_: ("SomeDataSource", []),
+                ),
+                False,
+                "bogusmode",
+            ),
+        )
+
+        # assert that the status shows recoverable errors
+        assert (
+            "Unexpected start time found for Network Stage. "
+            "Was this stage restarted?"
+            in m_json.call_args[0][1]["v1"]["init"]["recoverable_errors"][
+                "WARNING"
+            ]
+        )
