From f6f21b4e3c0bb34b3d725a3837c66a29fc3b7b32 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Wed, 18 Sep 2024 12:26:20 +0000 Subject: [PATCH] fix(vsock): save state after sending a notification This is a fix for a fix introduced in #4796 The issue was in the vsock device hanging after snapshot restoration due to the guest not being notified about the termination packet. But there was a bug in the fix, namely we saved the vsock state before the notification was sent, thus discarding all modifications made to send the notification. The reason the original fix worked is that we were only testing with 1 iteration of snap/restore. This way, even though we lost synchronization with the guest in the event queue state, it worked fine once. But doing more iterations causes vsock to hang as before. This commit fixes the issue by storing the vsock state after the notification is sent and modifies the vsock test to run multiple iterations of snap/restore. Signed-off-by: Egor Lazarchuk --- src/vmm/src/device_manager/persist.rs | 12 +- .../functional/test_vsock.py | 103 ++++++++++-------- 2 files changed, 62 insertions(+), 53 deletions(-) diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 7a51bf790e9..1a7505fb1dd 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -365,11 +365,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .downcast_mut::>() .unwrap(); - let vsock_state = VsockState { - backend: vsock.backend().save(), - frontend: vsock.save(), - }; - // Send Transport event to reset connections if device // is activated. if vsock.is_activated() { }); } + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. 
+ let vsock_state = VsockState { + backend: vsock.backend().save(), + frontend: vsock.save(), + }; + states.vsock_device = Some(ConnectedVsockState { device_id: devid.clone(), device_state: vsock_state, diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py index 104fbada886..95f52c670b4 100644 --- a/tests/integration_tests/functional/test_vsock.py +++ b/tests/integration_tests/functional/test_vsock.py @@ -227,54 +227,61 @@ def test_vsock_transport_reset_g2h(uvm_nano, microvm_factory): test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") test_vm.start() - host_socket_path = os.path.join( - test_vm.path, f"{VSOCK_UDS_PATH}_{ECHO_SERVER_PORT}" - ) - host_socat_commmand = [ - "socat", - "-dddd", - f"UNIX-LISTEN:{host_socket_path},fork", - "STDOUT", - ] - host_socat = subprocess.Popen( - host_socat_commmand, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - - # Give some time for host socat to create socket - time.sleep(0.5) - assert Path(host_socket_path).exists() - test_vm.create_jailed_resource(host_socket_path) - - # Create a socat process in the guest which will connect to the host socat - guest_socat_commmand = f"tmux new -d 'socat - vsock-connect:2:{ECHO_SERVER_PORT}'" - test_vm.ssh.run(guest_socat_commmand) - - # socat should be running in the guest now - code, _, _ = test_vm.ssh.run("pidof socat") - assert code == 0 - - # Create snapshot. + # Create snapshot and terminate a VM. snapshot = test_vm.snapshot_full() - test_vm.resume() - - # After `create_snapshot` + 'restore' calls, connection should be dropped - code, _, _ = test_vm.ssh.run("pidof socat") - assert code == 1 - - # Kill host socat as it is not useful anymore - host_socat.kill() - host_socat.communicate() - - # Terminate VM. test_vm.kill() - # Load snapshot. 
- vm2 = microvm_factory.build() - vm2.spawn() - vm2.restore_from_snapshot(snapshot, resume=True) - - # After snap restore all vsock connections should be - # dropped. This means guest socat should exit same way - # as it did after snapshot was taken. - code, _, _ = vm2.ssh.run("pidof socat") - assert code == 1 + for _ in range(5): + # Load snapshot. + new_vm = microvm_factory.build() + new_vm.spawn() + new_vm.restore_from_snapshot(snapshot, resume=True) + + # After snap restore all vsock connections should be + # dropped. This means guest socat should exit same way + # as it did after snapshot was taken. + code, _, _ = new_vm.ssh.run("pidof socat") + assert code == 1 + + host_socket_path = os.path.join( + new_vm.path, f"{VSOCK_UDS_PATH}_{ECHO_SERVER_PORT}" + ) + host_socat_commmand = [ + "socat", + "-dddd", + f"UNIX-LISTEN:{host_socket_path},fork", + "STDOUT", + ] + host_socat = subprocess.Popen( + host_socat_commmand, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + # Give some time for host socat to create socket + time.sleep(0.5) + assert Path(host_socket_path).exists() + new_vm.create_jailed_resource(host_socket_path) + + # Create a socat process in the guest which will connect to the host socat + guest_socat_commmand = ( + f"tmux new -d 'socat - vsock-connect:2:{ECHO_SERVER_PORT}'" + ) + new_vm.ssh.run(guest_socat_commmand) + + # socat should be running in the guest now + code, _, _ = new_vm.ssh.run("pidof socat") + assert code == 0 + + # Create snapshot. + snapshot = new_vm.snapshot_full() + new_vm.resume() + + # After `create_snapshot` + 'restore' calls, connection should be dropped + code, _, _ = new_vm.ssh.run("pidof socat") + assert code == 1 + + # Kill host socat as it is not useful anymore + host_socat.kill() + host_socat.communicate() + + # Terminate VM. + new_vm.kill()