Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Warn when signal is handled #5186

Merged
merged 8 commits into from
May 5, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 79 additions & 33 deletions cloudinit/cmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@
"once": PER_ONCE,
}

# https://cloudinit.readthedocs.io/en/latest/explanation/boot.html
STAGE_NAME = {
"init-local": "Local Stage",
"init": "Network Stage",
"modules-config": "Config Stage",
"modules-final": "Final Stage",
}

LOG = logging.getLogger(__name__)


Expand Down Expand Up @@ -737,18 +745,31 @@ def status_wrapper(name, args):
else:
raise ValueError("unknown name: %s" % name)

modes = (
if mode not in (
holmanb marked this conversation as resolved.
Show resolved Hide resolved
"init",
"init-local",
"modules-config",
"modules-final",
)
if mode not in modes:
):
raise ValueError(
"Invalid cloud init mode specified '{0}'".format(mode)
)

status = None
nullstatus = {
"errors": [],
"recoverable_errors": {},
"start": None,
"finished": None,
}
status = {
"v1": {
"datasource": None,
"init": nullstatus.copy(),
"init-local": nullstatus.copy(),
"modules-config": nullstatus.copy(),
"modules-final": nullstatus.copy(),
}
}
if mode == "init-local":
for f in (status_link, result_link, status_path, result_path):
util.del_file(f)
Expand All @@ -758,25 +779,21 @@ def status_wrapper(name, args):
except Exception:
pass

nullstatus = {
"errors": [],
"start": None,
"finished": None,
}

if status is None:
status = {"v1": {}}
status["v1"]["datasource"] = None

for m in modes:
if m not in status["v1"]:
status["v1"][m] = nullstatus.copy()
if mode not in status["v1"]:
# this should never happen, but leave it just to be safe
status["v1"][mode] = nullstatus.copy()

v1 = status["v1"]
v1["stage"] = mode
uptime = util.uptime()
v1[mode]["start"] = float(uptime)
v1[mode]["recoverable_errors"] = next(
if v1[mode]["start"] and not v1[mode]["finished"]:
# This stage was restarted, which isn't expected.
LOG.warning(
"Unexpected start time found for %s. Was this stage restarted?",
STAGE_NAME[mode],
)

v1[mode]["start"] = float(util.uptime())
preexisting_recoverable_errors = next(
filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers)
).export_logs()

Expand All @@ -795,27 +812,56 @@ def status_wrapper(name, args):
else:
errors = ret

v1[mode]["errors"] = [str(e) for e in errors]
v1[mode]["errors"].extend([str(e) for e in errors])

except Exception as e:
util.logexc(LOG, "failed stage %s", mode)
LOG.exception("failed stage %s", mode)
print_exc("failed run of stage %s" % mode)
v1[mode]["errors"] = [str(e)]

v1[mode]["finished"] = float(util.uptime())
v1["stage"] = None
v1[mode]["errors"].append(str(e))
raise
except SystemExit as e:
# silence a pylint false positive
# https://github.com/pylint-dev/pylint/issues/9556
if e.code: # pylint: disable=using-constant-test
# Only log errors when sys.exit() is called with a non-zero
# exit code
LOG.exception("failed stage %s", mode)
print_exc("failed run of stage %s" % mode)
v1[mode]["errors"].append(f"sys.exit({str(e.code)}) called")
raise
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still want to raise if exit code is 0?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question - and I'm not sure myself.

Raising will cause cloud-init to exit more quickly, but there's not really much else that cloud-init does after this point in the code.

If we don't raise, I only see three more things that cloud-init will do before exiting:

  1. write result.json (if in Final stage)
  2. log completion of the stage and the time it took to complete("cloud-init mode %s")
  3. flush reporting events

I lean slightly towards cloud-init doing the same behavior for both zero and non-zero exit modes. Prior to this change, none of these three were done when sys.exit() was called. Is there any harm in trying to write out these logs if they do happen? I assume that any service that systemd tries to restart with SIGTINT will just receive a bigger hammer after a timeout period to ensure that they actually restart, but I'm not certain of that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm leaning towards not re-raising from the SystemExit handler at all. What do you think @TheRealFalcon?

finally:
# It is desireable to write to status.json before exiting, even when
# sys.exit() is called.
v1[mode]["finished"] = float(util.uptime())
v1["stage"] = None

# merge new recoverable errors into existing recoverable error list
new_recoverable_errors = next(
filter(
lambda h: isinstance(h, log.LogExporter), root_logger.handlers
)
).export_logs()
for key in new_recoverable_errors.keys():
if key in preexisting_recoverable_errors:
v1[mode]["recoverable_errors"][key] = list(
set(
preexisting_recoverable_errors[key]
+ new_recoverable_errors[key]
)
)
else:
v1[mode]["recoverable_errors"][key] = new_recoverable_errors[
key
]

# Write status.json after running init / module code
v1[mode]["recoverable_errors"] = next(
filter(lambda h: isinstance(h, log.LogExporter), root_logger.handlers)
).export_logs()
atomic_helper.write_json(status_path, status)
# Write status.json after running init / module code
atomic_helper.write_json(status_path, status)

if mode == "modules-final":
# write the 'finished' file
errors = []
for m in modes:
if v1[m]["errors"]:
for m in v1.keys():
if isinstance(v1[m], dict) and v1[m].get("errors"):
errors.extend(v1[m].get("errors", []))

atomic_helper.write_json(
Expand Down
2 changes: 1 addition & 1 deletion cloudinit/signal_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _handle_exit(signum, frame):
contents = StringIO()
contents.write("%s\n" % (msg))
_pprint_frame(frame, 1, BACK_FRAME_TRACE_DEPTH, contents)
util.multi_log(contents.getvalue(), console=True, stderr=False, log=LOG)
util.multi_log(contents.getvalue(), log=LOG, log_level=logging.ERROR)
sys.exit(rc)


Expand Down
4 changes: 4 additions & 0 deletions tests/integration_tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ def verify_clean_log(log: str, ignore_deprecations: bool = True):
raise AssertionError(
"Found unexpected errors: %s" % "\n".join(error_logs)
)
if re.findall("Cloud-init.*received SIG", log):
raise AssertionError(
"Found unexpected signal termination: %s" % "\n".join(error_logs)
)

warning_count = log.count("[WARNING]")
expected_warnings = 0
Expand Down
142 changes: 137 additions & 5 deletions tests/unittests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import contextlib
import io
import json
import logging
import os
import sys
from collections import namedtuple

import pytest
Expand All @@ -16,6 +18,7 @@

M_PATH = "cloudinit.cmd.main."
Tmpdir = namedtuple("Tmpdir", ["tmpdir", "link_d", "data_d"])
FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])


@pytest.fixture()
Expand Down Expand Up @@ -76,7 +79,6 @@ def _call_main(self, sysv_args=None):
def test_status_wrapper_errors(
self, action, name, match, caplog, mock_status_wrapper
):
FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])
my_action = mock.Mock()

myargs = FakeArgs((action, my_action), False, "bogusmode")
Expand All @@ -102,8 +104,6 @@ def test_status_wrapper_init_local_writes_fresh_status_info(
str(_dir), {"status.json": "old", "result.json": "old"}
)

FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])

def myaction(name, args):
# Return an error to watch status capture them
return "SomeDatasource", ["an error"]
Expand Down Expand Up @@ -140,8 +140,6 @@ def test_status_wrapper_init_local_honor_cloud_dir(
data_d = mock_status_wrapper.data_d
link_d = mock_status_wrapper.link_d

FakeArgs = namedtuple("FakeArgs", ["action", "local", "mode"])

def myaction(name, args):
# Return an error to watch status capture them
return "SomeDatasource", ["an_error"]
Expand Down Expand Up @@ -414,3 +412,137 @@ def test_features_hook_subcommand(self, m_features):
assert "features" == parseargs.action[0]
assert False is parseargs.debug
assert False is parseargs.force


class TestSignalHandling:
@mock.patch("cloudinit.cmd.main.atomic_helper.write_json")
def test_status_wrapper_signal_sys_exit(
self,
m_json,
mocker,
mock_status_wrapper,
):
"""make sure that when sys.exit(N) is called, the correct code is
called
"""
for code in [1, 2, 3, 4]:
with pytest.raises(SystemExit) as e:
cli.status_wrapper(
"init",
FakeArgs(
(
None,
# silence pylint false positive
# https://github.com/pylint-dev/pylint/issues/9557
lambda *_: sys.exit(code), # pylint: disable=W0640
),
False,
"bogusmode",
),
)
assert code == e.value.code

# assert that the status shows errors
assert (
f"sys.exit({code}) called"
in m_json.call_args[0][1]["v1"]["init"]["errors"]
)

@mock.patch("cloudinit.cmd.main.atomic_helper.write_json")
def test_status_wrapper_no_signal_sys_exit(
self,
m_json,
mock_status_wrapper,
):
"""if sys.exit() is called from a non-signal handler, make sure that
cloud-init doesn't interfere - just re-raise"""
# call status_wrapper() with the required args
with pytest.raises(SystemExit) as e:
cli.status_wrapper(
"init",
FakeArgs(
(
None,
lambda *_: sys.exit(1),
),
False,
"bogusmode",
),
)
assert 1 == e.code
holmanb marked this conversation as resolved.
Show resolved Hide resolved

@mock.patch("cloudinit.cmd.main.atomic_helper.write_json")
def test_status_wrapper_signal_warnings(
self,
m_json,
mock_status_wrapper,
):
"""If a stage is started and status.json already has a start time but
no end time for that stage, this is an unknown state - make sure that
a warning is logged.
"""

# Write a status.json to the mocked temporary directory
for dir in mock_status_wrapper.data_d, mock_status_wrapper.link_d:
test_helpers.populate_dir(
str(dir),
{
"status.json": json.dumps(
{
"v1": {
"stage": "init",
"datasource": (
"DataSourceNoCloud "
"[seed=/var/.../seed/nocloud-net]"
"[dsmode=net]"
),
"init": {
"errors": [],
"recoverable_errors": {},
"start": 124.567,
"finished": None,
},
"init-local": {
"errors": [],
"recoverable_errors": {},
"start": 100.0,
"finished": 100.00001,
},
"modules-config": {
"errors": [],
"recoverable_errors": {},
"start": None,
"finished": None,
},
"modules-final": {
"errors": [],
"recoverable_errors": {},
"start": None,
"finished": None,
},
}
}
)
},
)
# call status_wrapper() with the required args
cli.status_wrapper(
"init",
FakeArgs(
(
None,
lambda *_: ("SomeDataSource", []),
),
False,
"bogusmode",
),
)

# assert that the status shows recoverable errors
assert (
"Unexpected start time found for Network Stage. "
"Was this stage restarted?"
in m_json.call_args[0][1]["v1"]["init"]["recoverable_errors"][
"WARNING"
]
)
Loading