Skip to content

Commit

Permalink
fix(vsock): save state after sending a notification
Browse files Browse the repository at this point in the history
This is a fix for a fix introduced in #4796
The issue was in vsock device hanging after snapshot
restoration due to the guest not being notified about
the termination packet. But there was bug in the fix, maily
we saved the vsock state before the notification was sent,
thus discarding all modifications made to sent the notification.

The reason original fix worked, is because we were only testing
with 1 iteration of snap/restore. This way even though we lost
synchronization with the guest in the event queue state, it worked
fine once. But doing more iterations causes vsock to hang
as before.

This commit fixes the issue by storing vsock state after the
notification is sent and modifies the vsock test to run
multiple iterations of snap/restore.

Signed-off-by: Egor Lazarchuk <yegorlz@amazon.co.uk>
  • Loading branch information
ShadowCurse authored and bchalios committed Oct 8, 2024
1 parent 121fab6 commit f6f21b4
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 53 deletions.
12 changes: 7 additions & 5 deletions src/vmm/src/device_manager/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,6 @@ impl<'a> Persist<'a> for MMIODeviceManager {
.downcast_mut::<Vsock<VsockUnixBackend>>()
.unwrap();

let vsock_state = VsockState {
backend: vsock.backend().save(),
frontend: vsock.save(),
};

// Send Transport event to reset connections if device
// is activated.
if vsock.is_activated() {
Expand All @@ -378,6 +373,13 @@ impl<'a> Persist<'a> for MMIODeviceManager {
});
}

// Save state after potential notification to the guest. This
// way we save changes to the queue the notification can cause.
let vsock_state = VsockState {
backend: vsock.backend().save(),
frontend: vsock.save(),
};

states.vsock_device = Some(ConnectedVsockState {
device_id: devid.clone(),
device_state: vsock_state,
Expand Down
103 changes: 55 additions & 48 deletions tests/integration_tests/functional/test_vsock.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,54 +227,61 @@ def test_vsock_transport_reset_g2h(uvm_nano, microvm_factory):
test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}")
test_vm.start()

host_socket_path = os.path.join(
test_vm.path, f"{VSOCK_UDS_PATH}_{ECHO_SERVER_PORT}"
)
host_socat_commmand = [
"socat",
"-dddd",
f"UNIX-LISTEN:{host_socket_path},fork",
"STDOUT",
]
host_socat = subprocess.Popen(
host_socat_commmand, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)

# Give some time for host socat to create socket
time.sleep(0.5)
assert Path(host_socket_path).exists()
test_vm.create_jailed_resource(host_socket_path)

# Create a socat process in the guest which will connect to the host socat
guest_socat_commmand = f"tmux new -d 'socat - vsock-connect:2:{ECHO_SERVER_PORT}'"
test_vm.ssh.run(guest_socat_commmand)

# socat should be running in the guest now
code, _, _ = test_vm.ssh.run("pidof socat")
assert code == 0

# Create snapshot.
# Create snapshot and terminate a VM.
snapshot = test_vm.snapshot_full()
test_vm.resume()

# After `create_snapshot` + 'restore' calls, connection should be dropped
code, _, _ = test_vm.ssh.run("pidof socat")
assert code == 1

# Kill host socat as it is not useful anymore
host_socat.kill()
host_socat.communicate()

# Terminate VM.
test_vm.kill()

# Load snapshot.
vm2 = microvm_factory.build()
vm2.spawn()
vm2.restore_from_snapshot(snapshot, resume=True)

# After snap restore all vsock connections should be
# dropped. This means guest socat should exit same way
# as it did after snapshot was taken.
code, _, _ = vm2.ssh.run("pidof socat")
assert code == 1
for _ in range(5):
# Load snapshot.
new_vm = microvm_factory.build()
new_vm.spawn()
new_vm.restore_from_snapshot(snapshot, resume=True)

# After snap restore all vsock connections should be
# dropped. This means guest socat should exit same way
# as it did after snapshot was taken.
code, _, _ = new_vm.ssh.run("pidof socat")
assert code == 1

host_socket_path = os.path.join(
new_vm.path, f"{VSOCK_UDS_PATH}_{ECHO_SERVER_PORT}"
)
host_socat_commmand = [
"socat",
"-dddd",
f"UNIX-LISTEN:{host_socket_path},fork",
"STDOUT",
]
host_socat = subprocess.Popen(
host_socat_commmand, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)

# Give some time for host socat to create socket
time.sleep(0.5)
assert Path(host_socket_path).exists()
new_vm.create_jailed_resource(host_socket_path)

# Create a socat process in the guest which will connect to the host socat
guest_socat_commmand = (
f"tmux new -d 'socat - vsock-connect:2:{ECHO_SERVER_PORT}'"
)
new_vm.ssh.run(guest_socat_commmand)

# socat should be running in the guest now
code, _, _ = new_vm.ssh.run("pidof socat")
assert code == 0

# Create snapshot.
snapshot = new_vm.snapshot_full()
new_vm.resume()

# After `create_snapshot` + 'restore' calls, connection should be dropped
code, _, _ = new_vm.ssh.run("pidof socat")
assert code == 1

# Kill host socat as it is not useful anymore
host_socat.kill()
host_socat.communicate()

# Terminate VM.
new_vm.kill()

0 comments on commit f6f21b4

Please sign in to comment.