Merge branch 'main' into seccomp-cleanup
roypat authored Dec 13, 2024
2 parents d9ca827 + 77d3704 commit fa67b7c
Showing 15 changed files with 188 additions and 498 deletions.
4 changes: 2 additions & 2 deletions .gitlint
@@ -9,8 +9,8 @@ line-length=72

[ignore-body-lines]
# Ignore HTTP reference links
# Ignore lines that start with 'Co-Authored-By' or with 'Signed-off-by'
regex=(^\[.+\]: http.+)|(^Co-Authored-By)|(^Signed-off-by)
# Ignore lines that start with 'Co-Authored-By', with 'Signed-off-by' or with 'Fixes'
regex=(^\[.+\]: http.+)|(^Co-Authored-By)|(^Signed-off-by)|(^Fixes:)

[ignore-by-author-name]
# Ignore certain rules for commits of which the author name matches a regex
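The new "(^Fixes:)" alternative means gitlint now also skips commit-body lines that carry a "Fixes:" trailer when enforcing body line length. A minimal sketch of how the combined pattern behaves, assuming Python's re module and made-up example lines (not part of the diff):

    import re

    # Pattern copied from the .gitlint change above.
    IGNORE_BODY_LINES = re.compile(
        r"(^\[.+\]: http.+)|(^Co-Authored-By)|(^Signed-off-by)|(^Fixes:)"
    )

    examples = [
        "[1]: https://github.com/firecracker-microvm/firecracker",  # reference link -> ignored
        "Co-Authored-By: Jane Doe <jane@example.com>",               # ignored
        "Signed-off-by: Jane Doe <jane@example.com>",                # ignored
        "Fixes: #0000",                                              # newly ignored by this change
        "An ordinary body line that is still length-checked.",
    ]

    for line in examples:
        state = "ignored" if IGNORE_BODY_LINES.match(line) else "checked"
        print(f"{state}: {line}")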
11 changes: 10 additions & 1 deletion tests/framework/microvm.py
@@ -247,6 +247,8 @@ def __init__(
self.mem_size_bytes = None
self.cpu_template_name = None

self._connections = []

self._pre_cmd = []
if numa_node:
node_str = str(numa_node)
@@ -282,6 +284,10 @@ def kill(self):
for monitor in self.monitors:
monitor.stop()

# Kill all background SSH connections
for connection in self._connections:
connection.close()

# We start with vhost-user backends,
# because if we stop Firecracker first, the backend will want
# to exit as well and this will cause a race condition.
@@ -1007,13 +1013,16 @@ def ssh_iface(self, iface_idx=0):
"""Return a cached SSH connection on a given interface id."""
guest_ip = list(self.iface.values())[iface_idx]["iface"].guest_ip
self.ssh_key = Path(self.ssh_key)
return net_tools.SSHConnection(
connection = net_tools.SSHConnection(
netns=self.netns.id,
ssh_key=self.ssh_key,
user="root",
host=guest_ip,
control_path=Path(self.chroot()) / f"ssh-{iface_idx}.sock",
on_error=self._dump_debug_information,
)
self._connections.append(connection)
return connection

@property
def ssh(self):
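The microvm.py change caches every SSHConnection handed out by ssh_iface() in self._connections so that kill() can close the persistent background SSH connections before the microVM and its network namespace are torn down. A minimal sketch of this track-then-clean-up pattern, using hypothetical stand-in classes rather than the real test framework:

    # Hypothetical stand-ins for Microvm/SSHConnection, only to illustrate the pattern.
    class FakeConnection:
        def __init__(self):
            self.closed = False

        def close(self):
            # The real SSHConnection stops its ssh ControlMaster process here.
            self.closed = True


    class FakeVm:
        def __init__(self):
            self._connections = []

        def ssh_iface(self):
            conn = FakeConnection()
            self._connections.append(conn)  # remember it for cleanup in kill()
            return conn

        def kill(self):
            # Close all background SSH connections before tearing the VM down.
            for conn in self._connections:
                conn.close()


    vm = FakeVm()
    conn = vm.ssh_iface()
    vm.kill()
    assert conn.closed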
2 changes: 1 addition & 1 deletion tests/framework/microvm_helpers.py
@@ -242,6 +242,6 @@ def trace_cmd_guest(self, fns, cmd, port=4321):
print("guest> trace-cmd record")
host_ip = self.vm.iface["eth0"]["iface"].host_ip
_guest_ps = self.vm.ssh.run(
f"trace-cmd record -N {host_ip}:{port} -p function {" ".join(fns)} {cmd}"
f"trace-cmd record -N {host_ip}:{port} -p function {' '.join(fns)} {cmd}"
)
return list(Path(".").glob("trace.*.dat"))
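The one-line change in microvm_helpers.py only swaps the quote characters inside the f-string expression. Reusing the f-string's own quote character inside a replacement field is accepted only from Python 3.12 on (PEP 701); on older interpreters it is a SyntaxError, so the inner quotes have to differ from the outer ones. A small illustration with a made-up argument list:

    fns = ["kvm_vcpu_ioctl", "handle_ept_violation"]  # made-up trace targets

    # Portable across Python versions: inner quotes differ from the outer quotes.
    cmd = f"trace-cmd record -p function {' '.join(fns)}"
    print(cmd)

    # The original f"... {" ".join(fns)} ..." form parses only on Python >= 3.12.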
6 changes: 6 additions & 0 deletions tests/framework/utils.py
@@ -397,6 +397,12 @@ def run_cmd(cmd, check=False, shell=True, cwd=None, timeout=None) -> CommandRetu
stdout, stderr = proc.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
proc.kill()

# Sometimes stdout/stderr are passed on to children, in which case killing
# the parent won't close them and communicate will still hang.
proc.stdout.close()
proc.stderr.close()

stdout, stderr = proc.communicate()

# Log the message with one call so that multiple statuses
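The utils.py fix addresses a hang in run_cmd's timeout path: proc.kill() only terminates the direct child, and if that child handed its stdout/stderr pipes to still-running grandchildren, the follow-up communicate() blocks until those write ends close. Closing our read ends first lets communicate() return with whatever output was captured. A self-contained sketch of the pattern (the shell command is purely illustrative):

    import subprocess

    proc = subprocess.Popen(
        "sleep 30 & sleep 30",  # backgrounded child inherits the pipes
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    try:
        stdout, stderr = proc.communicate(timeout=1)
    except subprocess.TimeoutExpired:
        proc.kill()
        # The backgrounded sleep still holds the pipe write ends, so a plain
        # communicate() would keep blocking; closing our read ends unblocks it.
        proc.stdout.close()
        proc.stderr.close()
        stdout, stderr = proc.communicate()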
100 changes: 53 additions & 47 deletions tests/framework/utils_vsock.py
@@ -11,6 +11,8 @@
from subprocess import Popen
from threading import Thread

from tenacity import Retrying, stop_after_attempt, wait_fixed

ECHO_SERVER_PORT = 5252
SERVER_ACCEPT_BACKLOG = 128
TEST_CONNECTION_COUNT = 50
@@ -142,53 +144,57 @@ def check_guest_connections(vm, server_port_path, blob_path, blob_hash):
["socat", f"UNIX-LISTEN:{server_port_path},fork,backlog=5", "exec:'/bin/cat'"]
)

# Link the listening Unix socket into the VM's jail, so that
# Firecracker can connect to it.
attempt = 0
# But 1st, give socat a bit of time to create the socket
while not Path(server_port_path).exists() and attempt < 3:
time.sleep(0.2)
attempt += 1
vm.create_jailed_resource(server_port_path)

# Increase maximum process count for the ssh service.
# Avoids: "bash: fork: retry: Resource temporarily unavailable"
# Needed to execute the bash script that tests for concurrent
# vsock guest initiated connections.
pids_max_file = "/sys/fs/cgroup/system.slice/ssh.service/pids.max"
ecode, _, _ = vm.ssh.run(f"echo 1024 > {pids_max_file}")
assert ecode == 0, "Unable to set max process count for guest ssh service."

# Build the guest worker sub-command.
# `vsock_helper` will read the blob file from STDIN and send the echo
# server response to STDOUT. This response is then hashed, and the
# hash is compared against `blob_hash` (computed on the host). This
# comparison sets the exit status of the worker command.
worker_cmd = "hash=$("
worker_cmd += "cat {}".format(blob_path)
worker_cmd += " | /tmp/vsock_helper echo 2 {}".format(ECHO_SERVER_PORT)
worker_cmd += " | md5sum | cut -f1 -d\\ "
worker_cmd += ")"
worker_cmd += ' && [[ "$hash" = "{}" ]]'.format(blob_hash)

# Run `TEST_CONNECTION_COUNT` concurrent workers, using the above
# worker sub-command.
# If any worker fails, this command will fail. If all worker sub-commands
# succeed, this will also succeed.
cmd = 'workers="";'
cmd += "for i in $(seq 1 {}); do".format(TEST_CONNECTION_COUNT)
cmd += " ({})& ".format(worker_cmd)
cmd += ' workers="$workers $!";'
cmd += "done;"
cmd += "for w in $workers; do wait $w || (wait; exit 1); done"

ecode, _, stderr = vm.ssh.run(cmd)
echo_server.terminate()
rc = echo_server.wait()
# socat exits with 128 + 15 (SIGTERM)
assert rc == 143

assert ecode == 0, stderr
try:
# Give socat a bit of time to create the socket
for attempt in Retrying(
wait=wait_fixed(0.2),
stop=stop_after_attempt(3),
reraise=True,
):
with attempt:
assert Path(server_port_path).exists()

# Link the listening Unix socket into the VM's jail, so that
# Firecracker can connect to it.
vm.create_jailed_resource(server_port_path)

# Increase maximum process count for the ssh service.
# Avoids: "bash: fork: retry: Resource temporarily unavailable"
# Needed to execute the bash script that tests for concurrent
# vsock guest initiated connections.
vm.ssh.check_output(
"echo 1024 > /sys/fs/cgroup/system.slice/ssh.service/pids.max"
)

# Build the guest worker sub-command.
# `vsock_helper` will read the blob file from STDIN and send the echo
# server response to STDOUT. This response is then hashed, and the
# hash is compared against `blob_hash` (computed on the host). This
# comparison sets the exit status of the worker command.
worker_cmd = "hash=$("
worker_cmd += "cat {}".format(blob_path)
worker_cmd += " | /tmp/vsock_helper echo 2 {}".format(ECHO_SERVER_PORT)
worker_cmd += " | md5sum | cut -f1 -d\\ "
worker_cmd += ")"
worker_cmd += ' && [[ "$hash" = "{}" ]]'.format(blob_hash)

# Run `TEST_CONNECTION_COUNT` concurrent workers, using the above
# worker sub-command.
# If any worker fails, this command will fail. If all worker sub-commands
# succeed, this will also succeed.
cmd = 'workers="";'
cmd += "for i in $(seq 1 {}); do".format(TEST_CONNECTION_COUNT)
cmd += " ({})& ".format(worker_cmd)
cmd += ' workers="$workers $!";'
cmd += "done;"
cmd += "for w in $workers; do wait $w || (wait; exit 1); done"

vm.ssh.check_output(cmd)
finally:
echo_server.terminate()
rc = echo_server.wait()
# socat exits with 128 + 15 (SIGTERM)
assert rc == 143


def make_host_port_path(uds_path, port):
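The utils_vsock.py rewrite replaces the hand-rolled attempt counter with tenacity's Retrying context manager and wraps the guest-side checks in try/finally so the socat echo server is always terminated. The retry idiom, shown below as a minimal sketch (the helper name is made up), re-runs the with-block on AssertionError until it passes or the attempts are exhausted; reraise=True surfaces the last AssertionError instead of a tenacity.RetryError:

    from pathlib import Path

    from tenacity import Retrying, stop_after_attempt, wait_fixed


    def wait_for_socket(path, attempts=3, delay=0.2):
        # Made-up helper: wait for `path` to appear, mirroring the retry loop above.
        for attempt in Retrying(
            wait=wait_fixed(delay),
            stop=stop_after_attempt(attempts),
            reraise=True,
        ):
            with attempt:
                assert Path(path).exists(), f"{path} was not created in time"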
129 changes: 84 additions & 45 deletions tests/host_tools/network.py
@@ -3,30 +3,35 @@
"""Utilities for test host microVM network setup."""

import ipaddress
import os
import random
import re
import signal
import string
import subprocess
from dataclasses import dataclass, field
from pathlib import Path

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from tenacity import retry, stop_after_attempt, wait_fixed

from framework import utils
from framework.utils import Timeout


class SSHConnection:
"""
SSHConnection encapsulates functionality for microVM SSH interaction.
This class should be instantiated as part of the ssh fixture with the
This class should be instantiated as part of the ssh fixture with
the hostname obtained from the MAC address, the username for logging into
the image and the path of the ssh key.
This translates into an SSH connection as follows:
ssh -i ssh_key_path username@hostname
Establishes a ControlMaster upon construction, which is then re-used
for all subsequent SSH interactions.
"""

def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None):
def __init__(
self, netns, ssh_key: Path, control_path: Path, host, user, *, on_error=None
):
"""Instantiate a SSH client and connect to a microVM."""
self.netns = netns
self.ssh_key = ssh_key
@@ -37,22 +42,13 @@ def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None):
assert (ssh_key.stat().st_mode & 0o777) == 0o400
self.host = host
self.user = user
self._control_path = control_path

self._on_error = None

self.options = [
"-o",
"LogLevel=ERROR",
"-o",
"ConnectTimeout=1",
"-o",
"StrictHostKeyChecking=no",
"-o",
"UserKnownHostsFile=/dev/null",
"-o",
"PreferredAuthentications=publickey",
"-i",
str(self.ssh_key),
f"ControlPath={self._control_path}",
]

# _init_connection loops until it can connect to the guest
@@ -96,39 +92,103 @@ def scp_get(self, remote_path, local_path, recursive=False):
self._scp(self.remote_path(remote_path), local_path, opts)

@retry(
retry=retry_if_exception_type(ChildProcessError),
wait=wait_fixed(0.5),
wait=wait_fixed(1),
stop=stop_after_attempt(20),
reraise=True,
)
def _init_connection(self):
"""Create an initial SSH client connection (retry until it works).
"""Initialize the persistent background connection which will be used
to execute all commands sent via this `SSHConnection` object.
Since we're connecting to a microVM we just started, we'll probably
have to wait for it to boot up and start the SSH server.
We'll keep trying to execute a remote command that can't fail
(`/bin/true`), until we get a successful (0) exit code.
"""
self.check_output("true", timeout=100, debug=True)
assert not self._control_path.exists()

# Sadly, we cannot get debug output from this command (e.g. `-vvv`),
# because passing -vvv causes the daemonized ssh to hold on to stderr,
# and inside utils.run_cmd we're using subprocess.communicate, which
# only returns once stderr gets closed (which would thus result in an
# indefinite hang).
establish_cmd = [
"ssh",
# Only need to pass the ssh key here, as all multiplexed
# connections won't have to re-authenticate
"-i",
str(self.ssh_key),
"-o",
"StrictHostKeyChecking=no",
"-o",
"ConnectTimeout=2",
# Set up a persistent background connection
"-o",
"ControlMaster=auto",
"-o",
"ControlPersist=yes",
*self.options,
self.user_host,
"/usr/bin/true",
]

# don't set a low timeout here, because otherwise we might get into a race condition
# where ssh already forked off the persisted connection daemon, but gets killed here
# before exiting itself. In that case, self._control_path will exist, and the retry
# will hit the assert at the start of this function.
self._exec(establish_cmd, check=True)

def run(self, cmd_string, timeout=None, *, check=False, debug=False):
def _check_liveness(self) -> int:
"""Checks whether the ControlPersist connection is still alive"""
check_cmd = ["ssh", "-O", "check", *self.options, self.user_host]

_, _, stderr = self._exec(check_cmd, check=True)

pid_match = re.match(r"Master running \(pid=(\d+)\)", stderr)

assert pid_match, f"SSH ControlMaster connection not alive anymore: {stderr}"

return int(pid_match.group(1))

def close(self):
"""Closes the ControlPersist connection"""
master_pid = self._check_liveness()

stop_cmd = ["ssh", "-O", "stop", *self.options, self.user_host]

_, _, stderr = self._exec(stop_cmd, check=True)

assert "Stop listening request sent" in stderr

try:
with Timeout(5):
utils.wait_process_termination(master_pid)
except TimeoutError:
# for some reason it won't exit, let's force it...
# if this also fails, then during teardown we'll get an error about
# "found a process with supposedly dead Firecracker's jailer ID"
os.kill(master_pid, signal.SIGKILL)

def run(self, cmd_string, timeout=100, *, check=False, debug=False):
"""
Execute the command passed as a string in the ssh context.
If `debug` is set, pass `-vvv` to `ssh`. Note that this will clobber stderr.
"""
self._check_liveness()

command = ["ssh", *self.options, self.user_host, cmd_string]

if debug:
command.insert(1, "-vvv")

return self._exec(command, timeout, check=check)

def check_output(self, cmd_string, timeout=None, *, debug=False):
def check_output(self, cmd_string, timeout=100, *, debug=False):
"""Same as `run`, but raises an exception on non-zero return code of remote command"""
return self.run(cmd_string, timeout, check=True, debug=debug)

def _exec(self, cmd, timeout=None, check=False):
def _exec(self, cmd, timeout=100, check=False):
"""Private function that handles the ssh client invocation."""
if self.netns is not None:
cmd = ["ip", "netns", "exec", self.netns] + cmd
@@ -141,27 +201,6 @@ def _exec(self, cmd, timeout=None, check=False):

raise

# pylint:disable=invalid-name
def Popen(
self,
cmd: str,
stdin=subprocess.DEVNULL,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
**kwargs,
) -> subprocess.Popen:
"""Execute the command in the guest and return a Popen object.
pop = uvm.ssh.Popen("while true; do echo $(date -Is) $RANDOM; sleep 1; done")
pop.stdout.read(16)
"""
cmd = ["ssh", *self.options, self.user_host, cmd]
if self.netns is not None:
cmd = ["ip", "netns", "exec", self.netns] + cmd
return subprocess.Popen(
cmd, stdin=stdin, stdout=stdout, stderr=stderr, **kwargs
)


def mac_from_ip(ip_address):
"""Create a MAC address based on the provided IP.
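The rewritten SSHConnection in network.py keeps a single multiplexed OpenSSH ControlMaster per control socket: _init_connection establishes it with ControlMaster=auto and ControlPersist=yes, later run() calls multiplex over it, _check_liveness uses `ssh -O check`, and close() uses `ssh -O stop`. A condensed sketch of that lifecycle with plain subprocess calls; the key path, control path, and host below are placeholders, not values from the diff:

    import subprocess

    CONTROL_PATH = "/tmp/example-ssh.sock"   # placeholder ControlPath
    TARGET = "root@192.168.0.2"              # placeholder user@host
    OPTS = ["-o", f"ControlPath={CONTROL_PATH}", "-o", "StrictHostKeyChecking=no"]

    # 1. Establish the persistent background (master) connection once.
    subprocess.run(
        ["ssh", "-i", "/path/to/key", "-o", "ControlMaster=auto",
         "-o", "ControlPersist=yes", *OPTS, TARGET, "/usr/bin/true"],
        check=True,
    )

    # 2. Subsequent commands reuse the master, so they need no re-authentication.
    subprocess.run(["ssh", *OPTS, TARGET, "uname -r"], check=True)

    # 3. 'ssh -O check' reports whether the master is alive (its pid goes to stderr).
    subprocess.run(["ssh", "-O", "check", *OPTS, TARGET], check=True)

    # 4. 'ssh -O stop' asks the master to stop accepting new multiplexed sessions;
    #    it exits once the existing ones finish.
    subprocess.run(["ssh", "-O", "stop", *OPTS, TARGET], check=True)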
(Diffs for the remaining changed files were not loaded.)
