From f33b0a055af1ede6fa4567287b93ae581f0c78c5 Mon Sep 17 00:00:00 2001 From: flyflypeng Date: Wed, 26 Jul 2023 17:23:59 +0800 Subject: [PATCH] [vmm]: support sandbox cgroup management in the host fix: #51 reason: 1. limit the vm process resource usage in the specified cgroup dir 2. support pod_overhead conception actually, which limits the non-vcpu type threads of vm process into pod_overhead cgroup dir and set the pod_overhead resources limit in this cgroup dir. Signed-off-by: flyflypeng --- .github/workflows/ci.yml | 2 + vmm/sandbox/Cargo.lock | 17 +- vmm/sandbox/Cargo.toml | 2 + vmm/sandbox/src/cgroup.rs | 389 ++++++++++++++++++++++++ vmm/sandbox/src/cloud_hypervisor/mod.rs | 16 +- vmm/sandbox/src/main.rs | 1 + vmm/sandbox/src/qemu/config.rs | 1 + vmm/sandbox/src/qemu/mod.rs | 15 +- vmm/sandbox/src/sandbox.rs | 70 ++++- vmm/sandbox/src/stratovirt/mod.rs | 38 ++- vmm/sandbox/src/stratovirt/qmp.rs | 112 +++++++ vmm/sandbox/src/utils.rs | 14 + vmm/sandbox/src/vm.rs | 18 +- 13 files changed, 683 insertions(+), 12 deletions(-) create mode 100644 vmm/sandbox/src/cgroup.rs create mode 100644 vmm/sandbox/src/stratovirt/qmp.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4ceb872..0f42f0e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,5 +63,7 @@ jobs: if: ${{ matrix.wasmEdge }} run: curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- -v ${{ matrix.wasmEdge }} >> /dev/null - name: Test + env: + RUST_BACKTRACE: full working-directory: ${{ matrix.directories }} run: sudo -E $(command -v cargo) test --all-features diff --git a/vmm/sandbox/Cargo.lock b/vmm/sandbox/Cargo.lock index 0df7b560..4df24072 100644 --- a/vmm/sandbox/Cargo.lock +++ b/vmm/sandbox/Cargo.lock @@ -247,6 +247,19 @@ dependencies = [ "regex", ] +[[package]] +name = "cgroups-rs" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b098e7c3a70d03c288fa0a96ccf13e770eb3d78c4cc0e1549b3c13215d5f965" +dependencies = [ + "libc", + "log", + "nix 0.25.1", + "regex", + "thiserror", +] + [[package]] name = "chrono" version = "0.4.26" @@ -315,7 +328,7 @@ version = "0.3.0" source = "git+https://github.com/kuasar-io/rust-extensions.git#6ae99540b754cd28c5389d5d6fdeff6ec7290ec5" dependencies = [ "async-trait", - "cgroups-rs", + "cgroups-rs 0.2.11", "command-fds", "containerd-shim-protos", "futures", @@ -2328,6 +2341,7 @@ dependencies = [ "api_client", "async-trait", "bytefmt", + "cgroups-rs 0.3.2", "containerd-sandbox", "containerd-shim", "env_logger", @@ -2343,6 +2357,7 @@ dependencies = [ "prost-types 0.10.1", "protobuf 3.2.0", "qapi", + "qapi-spec", "rand", "rtnetlink", "sandbox-derive", diff --git a/vmm/sandbox/Cargo.toml b/vmm/sandbox/Cargo.toml index 180af2ac..e5d450d5 100644 --- a/vmm/sandbox/Cargo.toml +++ b/vmm/sandbox/Cargo.toml @@ -39,6 +39,7 @@ uuid = { version = "1.1.2", features = ["v4"] } unshare = { version = "0.7.0", optional = true } os_pipe = { version = "0.9.2", optional = true } qapi = { version = "0.8.0", features = ["qmp", "async-tokio-all"], optional = true } +qapi-spec = {version = "0.3.1"} sandbox-derive = { path = "derive" } api_client = { git = "https://github.com/cloud-hypervisor/cloud-hypervisor.git", optional = true } rtnetlink = "0.12" @@ -46,3 +47,4 @@ netlink-packet-route = "0.15" netlink-packet-core = "0.5.0" ttrpc = { version = "0.7", features = ["async"] } protobuf = "3.2" +cgroups-rs = "0.3.2" diff --git a/vmm/sandbox/src/cgroup.rs b/vmm/sandbox/src/cgroup.rs new file mode 100644 index 00000000..db9500d8 --- /dev/null +++ b/vmm/sandbox/src/cgroup.rs @@ -0,0 +1,389 @@ +use anyhow::{anyhow, Ok, Result}; +use cgroups_rs::{ + cgroup_builder::*, cpu::CpuController, cpuset::CpuSetController, hugetlb::HugeTlbController, + memory::MemController, Cgroup, +}; +use containerd_sandbox::{cri::api::v1::LinuxContainerResources, data::SandboxData}; +use serde::{Deserialize, Serialize}; + +use crate::{ + utils::{get_overhead_resources, get_resources, get_total_resources}, + vm::VcpuThreads, +}; + +pub const VCPU_CGROUP_NAME: &str = "vcpu"; +pub const POD_OVERHEAD_CGROUP_NAME: &str = "pod_overhead"; + +/// `SandboxCgroup` represents a set of cgroups for a sandbox. +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct SandboxCgroup { + pub cgroup_parent_path: String, + #[serde(skip)] + pub sandbox_cgroup: Cgroup, + #[serde(skip)] + pub vcpu_cgroup: Cgroup, + #[serde(skip)] + pub pod_overhead_cgroup: Cgroup, +} + +impl SandboxCgroup { + pub fn create_sandbox_cgroups(cgroup_parent_path: &str, sandbox_id: &str) -> Result { + // Create sandbox cgroup in the all cgroup subsystem dir + let sandbox_cgroup_path = format!("{}/{}", cgroup_parent_path, sandbox_id); + // CgroupBuilder::new() func doesn't accept the cgroup name has "/" prefix, + // So need to remove the "/" prefix for sandbox_cgroup_path + let sandbox_cgroup_rela_path = sandbox_cgroup_path.trim_start_matches('/'); + + let sandbox_cgroup = + CgroupBuilder::new(sandbox_cgroup_rela_path).build(cgroups_rs::hierarchies::auto())?; + + // Only create the vcpu and pod_overhead cgroups in the cpu cgroup subsystem + let vcpu_cgroup_path = format!("{}/{}", sandbox_cgroup_rela_path, VCPU_CGROUP_NAME); + let vcpu_cgroup = CgroupBuilder::new(vcpu_cgroup_path.as_str()) + .set_specified_controllers(vec!["cpu".to_string()]) + .build(cgroups_rs::hierarchies::auto())?; + + let pod_overhead_cgroup_path = + format!("{}/{}", sandbox_cgroup_rela_path, POD_OVERHEAD_CGROUP_NAME); + let pod_overhead_cgroup = CgroupBuilder::new(pod_overhead_cgroup_path.as_str()) + .set_specified_controllers(vec!["cpu".to_string()]) + .build(cgroups_rs::hierarchies::auto())?; + + Ok(SandboxCgroup { + cgroup_parent_path: cgroup_parent_path.to_string(), + sandbox_cgroup, + vcpu_cgroup, + pod_overhead_cgroup, + }) + } + + pub fn update_res_for_sandbox_cgroups(&self, sandbox_data: &SandboxData) -> Result<()> { + // apply the total resources = sum(sum(containers_resources) + pod_overhead)) in the sandbox cgroup dir + if let Some(total_resources) = get_total_resources(sandbox_data) { + apply_cpu_resource(&self.sandbox_cgroup, &total_resources)?; + apply_memory_resource(&self.sandbox_cgroup, &total_resources)?; + apply_cpuset_resources(&self.sandbox_cgroup, &total_resources)?; + apply_hugetlb_resources(&self.sandbox_cgroup, &total_resources)?; + } + + // apply the cpu resource of containers in the vcpu cpu subsystem cgroup + if let Some(containers_resources) = get_resources(sandbox_data) { + apply_cpu_resource(&self.vcpu_cgroup, containers_resources)?; + } + + // apply the cpu resource of pod_overhead in the pod_overhead cpu subsystem cgroup + if let Some(overhead_resources) = get_overhead_resources(sandbox_data) { + apply_cpu_resource(&self.pod_overhead_cgroup, overhead_resources)?; + } + + Ok(()) + } + + pub fn add_process_into_sandbox_cgroups( + &self, + pid: u32, + vcpu_threads: Option, + ) -> Result<()> { + // Add vmm process into the sandbox_cgroup + self.sandbox_cgroup.add_task_by_tgid((pid as u64).into())?; + self.pod_overhead_cgroup + .add_task_by_tgid((pid as u64).into())?; + + if let Some(all_vcpu_threads) = vcpu_threads { + // Move vmm process from parent sandbox cgroup into pod_overhead cgroup + // Then move the all vcpu threads of vmm process into vcpu cgroup + for (_, vcpu_thread_tid) in all_vcpu_threads.vcpus { + self.vcpu_cgroup.add_task((vcpu_thread_tid as u64).into())?; + } + } + + Ok(()) + } + + pub fn remove_sandbox_cgroups(&self) -> Result<()> { + remove_sandbox_cgroup(&self.vcpu_cgroup)?; + remove_sandbox_cgroup(&self.pod_overhead_cgroup)?; + remove_sandbox_cgroup(&self.sandbox_cgroup)?; + Ok(()) + } +} + +fn apply_cpu_resource(cgroup: &Cgroup, res: &LinuxContainerResources) -> Result<()> { + let cpu_controller: &CpuController = cgroup + .controller_of() + .ok_or_else(|| anyhow!("No cpu controller attached!"))?; + + if res.cpu_period != 0 { + cpu_controller.set_cfs_period(res.cpu_period.try_into()?)?; + } + if res.cpu_quota != 0 { + cpu_controller.set_cfs_quota(res.cpu_quota)?; + } + if res.cpu_shares != 0 { + cpu_controller.set_shares(res.cpu_shares.try_into()?)?; + } + + Ok(()) +} + +fn apply_memory_resource(cgroup: &Cgroup, res: &LinuxContainerResources) -> Result<()> { + let mem_controller: &MemController = cgroup + .controller_of() + .ok_or_else(|| anyhow!("No memory controller attached!"))?; + + if res.memory_limit_in_bytes != 0 { + mem_controller.set_limit(res.memory_limit_in_bytes)?; + } + if res.memory_swap_limit_in_bytes != 0 { + mem_controller.set_memswap_limit(res.memory_swap_limit_in_bytes)?; + } + + Ok(()) +} + +fn apply_cpuset_resources(cgroup: &Cgroup, res: &LinuxContainerResources) -> Result<()> { + let cpuset_controller: &CpuSetController = cgroup + .controller_of() + .ok_or_else(|| anyhow!("No cpuset controller attached!"))?; + + if !res.cpuset_cpus.is_empty() { + cpuset_controller.set_cpus(&res.cpuset_cpus)?; + } + if !res.cpuset_mems.is_empty() { + cpuset_controller.set_mems(&res.cpuset_mems)?; + } + + Ok(()) +} + +fn apply_hugetlb_resources(cgroup: &Cgroup, res: &LinuxContainerResources) -> Result<()> { + let hugetlb_controller: &HugeTlbController = cgroup + .controller_of() + .ok_or_else(|| anyhow!("No hugetlb controller attached!"))?; + for h in res.hugepage_limits.iter() { + hugetlb_controller.set_limit_in_bytes(h.page_size.as_str(), h.limit)?; + } + Ok(()) +} + +fn remove_sandbox_cgroup(cgroup: &Cgroup) -> Result<()> { + // get the tids in the current cgroup and then move the tids to parent cgroup + let tids = cgroup.tasks(); + for tid in tids { + cgroup.move_task_to_parent(tid)?; + } + + cgroup.delete()?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, result::Result::Ok}; + + use cgroups_rs::Controller; + use containerd_sandbox::{ + cri::api::v1::{HugepageLimit, LinuxPodSandboxConfig}, + PodSandboxConfig, + }; + + use super::*; + use crate::utils::get_sandbox_cgroup_parent_path; + + fn create_mock_pod_sandbox_config() -> PodSandboxConfig { + let mut pod_sandbox_config = PodSandboxConfig::default(); + pod_sandbox_config.linux = Some(LinuxPodSandboxConfig { + cgroup_parent: "/kubepods/burstable/podxxx".to_string(), + security_context: None, + sysctls: HashMap::new(), + overhead: Some(LinuxContainerResources { + cpu_period: 100000, + cpu_quota: 50000, + cpu_shares: 1024, + memory_limit_in_bytes: 100 * 1024 * 1024, + oom_score_adj: 0, + cpuset_cpus: "".to_string(), + cpuset_mems: "".to_string(), + hugepage_limits: vec![], + unified: HashMap::new(), + memory_swap_limit_in_bytes: 0 + 100 * 1024 * 1024, + }), + resources: Some(LinuxContainerResources { + cpu_period: 100000, + cpu_quota: 200000, + cpu_shares: 1024, + memory_limit_in_bytes: 1024 * 1024 * 1024, + oom_score_adj: 0, + cpuset_cpus: "0-1".to_string(), + cpuset_mems: "0".to_string(), + hugepage_limits: vec![HugepageLimit { + page_size: "2MB".to_string(), + limit: 2 * 1024 * 1024 * 1024, + }], + unified: HashMap::new(), + memory_swap_limit_in_bytes: 0 + 1024 * 1024 * 1024, + }), + }); + pod_sandbox_config + } + + #[test] + fn test_create_sandbox_cgroups() { + // Currently only support cgroup V1, cgroup V2 is not supported now + if cgroups_rs::hierarchies::is_cgroup2_unified_mode() { + return; + } + + // Create a SandboxData instance + let mut sandbox_data = SandboxData::default(); + sandbox_data.id = String::from("test_sandbox"); + + // Case 1: sandbox.config is None, expect return the error + sandbox_data.config = None; + let sandbox_cgroup_path = get_sandbox_cgroup_parent_path(&sandbox_data); + assert_eq!(sandbox_cgroup_path.is_none(), true); + + // Case 2: sandbox.config is corrent, expect create sandbox cgroup successfully + let pod_sandbox_config = create_mock_pod_sandbox_config(); + sandbox_data.config = Some(pod_sandbox_config); + let sandbox_cgroup_path = get_sandbox_cgroup_parent_path(&sandbox_data).unwrap(); + let result = SandboxCgroup::create_sandbox_cgroups(&sandbox_cgroup_path, &sandbox_data.id); + match result { + Ok(sandbox_cgoups) => { + // Get the test environment cpu subsystem cgroup mountpoint path + let cpu_cgroup_root_pathbuf = cgroups_rs::hierarchies::V1::new() + .get_mount_point(cgroups_rs::Controllers::Cpu) + .unwrap(); + println!("cpu_cgroup_root_pathbuf: {:?}", cpu_cgroup_root_pathbuf); + + // check sandbox level cgroup + let sandbox_cgroup_cpu_controller: &CpuController = + sandbox_cgoups.sandbox_cgroup.controller_of().unwrap(); + assert!(sandbox_cgroup_cpu_controller.path().exists()); + assert_eq!( + sandbox_cgroup_cpu_controller.path(), + cpu_cgroup_root_pathbuf + .join("kubepods/burstable/podxxx/test_sandbox") + .as_path() + ); + + // check vcpu level cgroup + let vcpu_cgroup_cpu_controller: &CpuController = + sandbox_cgoups.vcpu_cgroup.controller_of().unwrap(); + assert!(vcpu_cgroup_cpu_controller.path().exists()); + assert_eq!( + vcpu_cgroup_cpu_controller.path(), + cpu_cgroup_root_pathbuf + .join("kubepods/burstable/podxxx/test_sandbox/vcpu") + .as_path() + ); + } + Err(e) => panic!("Expected an Ok, but got error: {}", e.to_string()), + } + + // Case 3: If sandbox cgroups already exist in the system, then call create_sandbox_cgroups + // function again will not fail + let result = SandboxCgroup::create_sandbox_cgroups(&sandbox_cgroup_path, &sandbox_data.id); + match result { + Ok(sandbox_cgoups) => { + let sandbox_cgroup_mem_controller: &MemController = + sandbox_cgoups.sandbox_cgroup.controller_of().unwrap(); + assert!(sandbox_cgroup_mem_controller.path().exists()); + assert_eq!( + sandbox_cgroup_mem_controller.path().to_str().unwrap(), + "/sys/fs/cgroup/memory/kubepods/burstable/podxxx/test_sandbox" + ); + + // Clean the test sandbox cgroups + assert_eq!(sandbox_cgoups.remove_sandbox_cgroups().is_ok(), true); + } + Err(e) => panic!("Expected an Ok, but got error: {}", e.to_string()), + } + } + + #[test] + fn test_update_res_for_sandbox_cgroups_success() { + // Currently only support cgroup V1, cgroup V2 is not supported now + if cgroups_rs::hierarchies::is_cgroup2_unified_mode() { + return; + } + + // Case 1: successfully case + // Create a SandboxData instance + let mut sandbox_data = SandboxData::default(); + sandbox_data.id = String::from("test_sandbox"); + sandbox_data.config = Some(create_mock_pod_sandbox_config()); + + // Create a SandboxData instance + let sandbox_cgroup_path = get_sandbox_cgroup_parent_path(&sandbox_data).unwrap(); + let sandbox_cgroups = + SandboxCgroup::create_sandbox_cgroups(&sandbox_cgroup_path, &sandbox_data.id).unwrap(); + let result = sandbox_cgroups.update_res_for_sandbox_cgroups(&sandbox_data); + + match result { + Ok(_) => { + // Check sandbox level cgroup total resources + // Check cpu subsystem cgroup limit + let sandbox_cgroup_cpu_controller: &CpuController = + sandbox_cgroups.sandbox_cgroup.controller_of().unwrap(); + assert_eq!(sandbox_cgroup_cpu_controller.cfs_period().unwrap(), 100000); + assert_eq!(sandbox_cgroup_cpu_controller.cfs_quota().unwrap(), 250000); + assert_eq!(sandbox_cgroup_cpu_controller.shares().unwrap(), 2048); + + // Check memory subsystem cgroup limit + let sandbox_cgroup_mem_controller: &MemController = + sandbox_cgroups.sandbox_cgroup.controller_of().unwrap(); + let memory_stats = sandbox_cgroup_mem_controller.memory_stat(); + let memory_swap_stats = sandbox_cgroup_mem_controller.memswap(); + assert_eq!(memory_stats.limit_in_bytes, 1124 * 1024 * 1024); + assert_eq!(memory_swap_stats.limit_in_bytes, 1124 * 1024 * 1024); + + // Check cpuset subsystem cgroup limit + let sandbox_cgroup_cpuset_controller: &CpuSetController = + sandbox_cgroups.sandbox_cgroup.controller_of().unwrap(); + let cpuset_stats = sandbox_cgroup_cpuset_controller.cpuset(); + assert_eq!(cpuset_stats.cpus, vec![(0, 1)]); + assert_eq!(cpuset_stats.mems, vec![(0, 0)]); + + // Check hugetlb subsystem cgroup limit + let sandbox_cgroup_hugetlb_controller: &HugeTlbController = + sandbox_cgroups.sandbox_cgroup.controller_of().unwrap(); + assert_eq!( + sandbox_cgroup_hugetlb_controller + .get_sizes() + .contains(&"2MB".to_string()), + true + ); + assert_eq!( + sandbox_cgroup_hugetlb_controller + .limit_in_bytes("2MB") + .unwrap(), + 2 * 1024 * 1024 * 1024 + ); + + // check vcpu level cgroup + let vcpu_cgroup_cpu_controller: &CpuController = + sandbox_cgroups.vcpu_cgroup.controller_of().unwrap(); + assert_eq!(vcpu_cgroup_cpu_controller.cfs_period().unwrap(), 100000); + assert_eq!(vcpu_cgroup_cpu_controller.cfs_quota().unwrap(), 200000); + assert_eq!(vcpu_cgroup_cpu_controller.shares().unwrap(), 1024); + + // check pod_overhead level cgroup + let pod_overhead_cgroup_cpu_controller: &CpuController = + sandbox_cgroups.pod_overhead_cgroup.controller_of().unwrap(); + assert_eq!( + pod_overhead_cgroup_cpu_controller.cfs_period().unwrap(), + 100000 + ); + assert_eq!( + pod_overhead_cgroup_cpu_controller.cfs_quota().unwrap(), + 50000 + ); + assert_eq!(pod_overhead_cgroup_cpu_controller.shares().unwrap(), 1024); + } + Err(e) => panic!("Expected an Ok, but got error: {}", e.to_string()), + } + + assert_eq!(sandbox_cgroups.remove_sandbox_cgroups().is_ok(), true); + } +} diff --git a/vmm/sandbox/src/cloud_hypervisor/mod.rs b/vmm/sandbox/src/cloud_hypervisor/mod.rs index eb73556a..5c1b2b5b 100644 --- a/vmm/sandbox/src/cloud_hypervisor/mod.rs +++ b/vmm/sandbox/src/cloud_hypervisor/mod.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::{os::unix::io::RawFd, process::Stdio}; +use std::{collections::HashMap, os::unix::io::RawFd, process::Stdio}; use anyhow::anyhow; use async_trait::async_trait; @@ -40,7 +40,7 @@ use crate::{ impl_recoverable, param::ToCmdLineParams, utils::{read_file, read_std, set_cmd_fd, set_cmd_netns, wait_pid, write_file_atomic}, - vm::VM, + vm::{Pids, VcpuThreads, VM}, }; mod client; @@ -245,6 +245,18 @@ impl VM for CloudHypervisorVM { async fn wait_channel(&self) -> Option> { return self.wait_chan.clone(); } + + async fn vcpus(&self) -> Result { + // TODO: support get vcpu threads id + Ok(VcpuThreads { + vcpus: HashMap::new(), + }) + } + + fn pids(&self) -> Pids { + // TODO: support get all vmm related pids + Pids::default() + } } impl_recoverable!(CloudHypervisorVM); diff --git a/vmm/sandbox/src/main.rs b/vmm/sandbox/src/main.rs index 32d3e3c5..60d81d36 100644 --- a/vmm/sandbox/src/main.rs +++ b/vmm/sandbox/src/main.rs @@ -74,6 +74,7 @@ cfg_stratovirt! { #[macro_use] mod device; +mod cgroup; mod container; mod io; mod kata_config; diff --git a/vmm/sandbox/src/qemu/config.rs b/vmm/sandbox/src/qemu/config.rs index 8cbdab05..172df4d8 100644 --- a/vmm/sandbox/src/qemu/config.rs +++ b/vmm/sandbox/src/qemu/config.rs @@ -17,6 +17,7 @@ limitations under the License. use std::{collections::HashMap, os::unix::io::RawFd}; use containerd_sandbox::error::{Error, Result}; +#[cfg(target_arch = "x86_64")] use lazy_static::lazy_static; use sandbox_derive::{CmdLineParamSet, CmdLineParams}; use serde::{Deserialize, Serialize}; diff --git a/vmm/sandbox/src/qemu/mod.rs b/vmm/sandbox/src/qemu/mod.rs index f884a4cf..1cbf160e 100644 --- a/vmm/sandbox/src/qemu/mod.rs +++ b/vmm/sandbox/src/qemu/mod.rs @@ -15,6 +15,7 @@ limitations under the License. */ use std::{ + collections::HashMap, os::unix::io::{AsRawFd, FromRawFd, RawFd}, time::{Duration, SystemTime}, }; @@ -54,7 +55,7 @@ use crate::{ utils::detect_pid, }, utils::{read_file, read_std, wait_channel, wait_pid}, - vm::{BlockDriver, VM}, + vm::{BlockDriver, Pids, VcpuThreads, VM}, }; pub mod config; @@ -298,6 +299,18 @@ impl VM for QemuVM { async fn wait_channel(&self) -> Option> { return self.wait_chan.clone(); } + + async fn vcpus(&self) -> Result { + // TODO: support get vcpu threads id + Ok(VcpuThreads { + vcpus: HashMap::new(), + }) + } + + fn pids(&self) -> Pids { + // TODO: support get all vmm related pids + Pids::default() + } } impl QemuVM { diff --git a/vmm/sandbox/src/sandbox.rs b/vmm/sandbox/src/sandbox.rs index c66d3e70..034b1d66 100644 --- a/vmm/sandbox/src/sandbox.rs +++ b/vmm/sandbox/src/sandbox.rs @@ -25,7 +25,7 @@ use containerd_sandbox::{ utils::cleanup_mounts, ContainerOption, Sandbox, SandboxOption, SandboxStatus, Sandboxer, }; -use log::{error, info, warn}; +use log::{debug, error, info, warn}; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use tokio::{ fs::{remove_dir_all, OpenOptions}, @@ -35,13 +35,14 @@ use tokio::{ use vmm_common::{api::sandbox_ttrpc::SandboxServiceClient, storage::Storage, SHARED_DIR_SUFFIX}; use crate::{ + cgroup::SandboxCgroup, client::{ client_check, client_sync_clock, client_update_interfaces, client_update_routes, new_sandbox_client, }, container::KuasarContainer, network::{Network, NetworkConfig}, - utils::get_resources, + utils::{get_resources, get_sandbox_cgroup_parent_path}, vm::{Hooks, Recoverable, VMFactory, VM}, }; @@ -155,6 +156,8 @@ pub struct KuasarSandbox { pub(crate) client: Arc>>, #[serde(skip, default)] pub(crate) exit_signal: Arc, + #[serde(default)] + pub(crate) sandbox_cgroups: SandboxCgroup, } #[async_trait] @@ -171,6 +174,31 @@ where return Err(Error::AlreadyExist("sandbox".to_string())); } + let mut sandbox_cgroups = SandboxCgroup::default(); + let cgroup_parent_path = match get_sandbox_cgroup_parent_path(&s.sandbox) { + Some(cgroup_parent_path) => cgroup_parent_path, + None => { + return Err(Error::Other(anyhow!( + "Failed to get sandbox cgroup parent path." + ))) + } + }; + // Currently only support cgroup V1, cgroup V2 is not supported now + if !cgroups_rs::hierarchies::is_cgroup2_unified_mode() { + // Create sandbox's cgroup and apply sandbox's resources limit + let create_and_update_sandbox_cgroup = (|| { + sandbox_cgroups = + SandboxCgroup::create_sandbox_cgroups(&cgroup_parent_path, &s.sandbox.id)?; + sandbox_cgroups.update_res_for_sandbox_cgroups(&s.sandbox)?; + Ok(()) + })(); + // If create and update sandbox cgroup failed, do rollback operation + if let Err(e) = create_and_update_sandbox_cgroup { + let _ = sandbox_cgroups.remove_sandbox_cgroups(); + return Err(e); + } + } + // TODO support network let vm = self.factory.create_vm(id, &s).await?; let mut sandbox = KuasarSandbox { @@ -185,6 +213,7 @@ where network: None, client: Arc::new(Mutex::new(None)), exit_signal: Arc::new(ExitSignal::default()), + sandbox_cgroups, }; // Handle pod network if it has a private network namespace @@ -221,6 +250,32 @@ where let mut sandbox = sandbox_mutex.lock().await; self.hooks.pre_start(&mut sandbox).await?; sandbox.start().await?; + + // Currently only support cgroup V1, cgroup V2 is not supported now + if !cgroups_rs::hierarchies::is_cgroup2_unified_mode() { + // add vmm process into sandbox cgroup + if let SandboxStatus::Running(vmm_pid) = sandbox.status { + let vcpu_threads = sandbox.vm.vcpus().await?; + debug!( + "vmm process pid: {}, vcpu threads pid: {:?}", + vmm_pid, vcpu_threads + ); + sandbox + .sandbox_cgroups + .add_process_into_sandbox_cgroups(vmm_pid, Some(vcpu_threads))?; + // move the virtiofsd process into sandbox cgroup + if let Some(virtiofsd_pid) = sandbox.vm.pids().virtiofsd_pid { + sandbox + .sandbox_cgroups + .add_process_into_sandbox_cgroups(virtiofsd_pid, None)?; + } + } else { + return Err(Error::Other(anyhow!( + "sandbox status is not Running after started!" + ))); + } + } + let sandbox_clone = sandbox_mutex.clone(); monitor(sandbox_clone); self.hooks.post_start(&mut sandbox).await?; @@ -253,6 +308,13 @@ where if let Some(sb_mutex) = sb_clone.get(id) { let mut sb = sb_mutex.lock().await; sb.stop(true).await?; + + // Currently only support cgroup V1, cgroup V2 is not supported now + if !cgroups_rs::hierarchies::is_cgroup2_unified_mode() { + // remove the sandbox cgroups + sb.sandbox_cgroups.remove_sandbox_cgroups()?; + } + cleanup_mounts(&sb.base_dir).await?; remove_dir_all(&sb.base_dir).await?; } @@ -373,6 +435,10 @@ where if let SandboxStatus::Running(_) = sb.status { sb.vm.recover().await?; } + // recover the sandbox_cgroups in the sandbox object + sb.sandbox_cgroups = + SandboxCgroup::create_sandbox_cgroups(&sb.sandbox_cgroups.cgroup_parent_path, &sb.id)?; + Ok(sb) } } diff --git a/vmm/sandbox/src/stratovirt/mod.rs b/vmm/sandbox/src/stratovirt/mod.rs index f110c253..4f82dc5b 100644 --- a/vmm/sandbox/src/stratovirt/mod.rs +++ b/vmm/sandbox/src/stratovirt/mod.rs @@ -15,6 +15,7 @@ limitations under the License. */ use std::{ + collections::HashMap, os::unix::io::{AsRawFd, FromRawFd, RawFd}, time::{Duration, SystemTime}, }; @@ -26,6 +27,7 @@ use futures_util::TryFutureExt; use log::{debug, error, trace, warn}; use nix::{fcntl::OFlag, libc::kill, sys::stat::Mode}; use qapi::qmp::quit; +use qmp::CpuInfo; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use tokio::{ @@ -52,13 +54,14 @@ use crate::{ virtiofs::VirtiofsDaemon, }, utils::{read_file, read_std, wait_channel, wait_pid}, - vm::{BlockDriver, VM}, + vm::{BlockDriver, Pids, VcpuThreads, VM}, }; pub mod config; mod devices; pub mod factory; pub mod hooks; +mod qmp; mod qmp_client; mod utils; mod virtiofs; @@ -80,6 +83,7 @@ pub struct StratoVirtVM { console_socket: String, agent_socket: String, netns: String, + pids: Pids, #[serde(skip)] block_driver: BlockDriver, #[serde(skip)] @@ -131,8 +135,15 @@ impl VM for StratoVirtVM { error!("failed to read console log, {}", e); }); }); - // TODO return stratovirt pid - Ok(0) + + // update vmm related pids + let vmm_pid = detect_pid(self.config.pid_file.as_str(), self.config.path.as_str()).await?; + self.pids.vmm_pid = Some(vmm_pid); + if let Some(virtiofsd) = &self.virtiofs_daemon { + self.pids.virtiofsd_pid = virtiofsd.pid; + } + + Ok(vmm_pid) } async fn stop(&mut self, force: bool) -> Result<()> { @@ -238,6 +249,26 @@ impl VM for StratoVirtVM { async fn wait_channel(&self) -> Option> { return self.wait_chan.clone(); } + + async fn vcpus(&self) -> Result { + let client = self.get_client()?; + let result = client.execute(qmp::query_cpus {}).await?; + let mut vcpu_threads_map: HashMap = HashMap::new(); + for vcpu_info in result.iter() { + match vcpu_info { + CpuInfo::Arm { base, .. } | CpuInfo::x86 { base, .. } => { + vcpu_threads_map.insert(base.CPU, base.thread_id); + } + } + } + Ok(VcpuThreads { + vcpus: vcpu_threads_map, + }) + } + + fn pids(&self) -> Pids { + self.pids.clone() + } } impl StratoVirtVM { @@ -257,6 +288,7 @@ impl StratoVirtVM { virtiofs_daemon: None, pcie_root_ports_pool: None, pcie_root_bus: PcieRootBus::default(), + pids: Pids::default(), } } diff --git a/vmm/sandbox/src/stratovirt/qmp.rs b/vmm/sandbox/src/stratovirt/qmp.rs new file mode 100644 index 00000000..185357c9 --- /dev/null +++ b/vmm/sandbox/src/stratovirt/qmp.rs @@ -0,0 +1,112 @@ +#![allow(warnings)] + +use qapi::qmp::QmpCommand; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct query_cpus {} + +impl QmpCommand for query_cpus {} +impl ::qapi_spec::Command for query_cpus { + const NAME: &'static str = "query-cpus"; + const ALLOW_OOB: bool = false; + + type Ok = Vec; +} + +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CpuInfoArch { + #[serde(rename = "x86")] + x86, + #[serde(rename = "Arm")] + Arm, +} + +impl ::core::str::FromStr for CpuInfoArch { + type Err = (); + + fn from_str(s: &str) -> Result { + ::qapi_spec::Enum::from_name(s).ok_or(()) + } +} + +unsafe impl ::qapi_spec::Enum for CpuInfoArch { + fn discriminant(&self) -> usize { + *self as usize + } + + const COUNT: usize = 2; + const VARIANTS: &'static [Self] = &[CpuInfoArch::x86, CpuInfoArch::Arm]; + const NAMES: &'static [&'static str] = &["x86", "Arm"]; +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "arch")] +pub enum CpuInfo { + #[serde(rename = "arm")] + Arm { + #[serde(flatten)] + #[serde(rename = "base")] + base: CpuInfoBase, + #[serde(flatten)] + #[serde(rename = "Arm")] + Arm: CpuInfoArm, + }, + #[serde(rename = "x86")] + x86 { + #[serde(flatten)] + #[serde(rename = "base")] + base: CpuInfoBase, + #[serde(flatten)] + #[serde(rename = "x86")] + x86: CpuInfoX86, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CpuInfoBase { + #[serde(rename = "CPU")] + pub CPU: i64, + #[serde(rename = "current")] + pub current: bool, + #[serde(rename = "halted")] + pub halted: bool, + #[serde(rename = "qom_path")] + pub qom_path: ::std::string::String, + #[serde(rename = "thread_id")] + pub thread_id: i64, +} + +impl CpuInfo { + pub fn arch(&self) -> CpuInfoArch { + match *self { + CpuInfo::Arm { .. } => CpuInfoArch::Arm, + + CpuInfo::x86 { .. } => CpuInfoArch::x86, + } + } +} + +impl From<(CpuInfoArm, CpuInfoBase)> for CpuInfo { + fn from(val: (CpuInfoArm, CpuInfoBase)) -> Self { + Self::Arm { + Arm: val.0, + base: val.1, + } + } +} + +impl From<(CpuInfoX86, CpuInfoBase)> for CpuInfo { + fn from(val: (CpuInfoX86, CpuInfoBase)) -> Self { + Self::x86 { + x86: val.0, + base: val.1, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct CpuInfoArm {} + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct CpuInfoX86 {} diff --git a/vmm/sandbox/src/utils.rs b/vmm/sandbox/src/utils.rs index ff052225..4c998db3 100644 --- a/vmm/sandbox/src/utils.rs +++ b/vmm/sandbox/src/utils.rs @@ -79,6 +79,13 @@ pub fn get_resources(data: &SandboxData) -> Option<&LinuxContainerResources> { .and_then(|l| l.resources.as_ref()) } +pub fn get_overhead_resources(data: &SandboxData) -> Option<&LinuxContainerResources> { + data.config + .as_ref() + .and_then(|c| c.linux.as_ref()) + .and_then(|l| l.overhead.as_ref()) +} + #[allow(dead_code)] pub fn get_total_resources(data: &SandboxData) -> Option { return data @@ -458,3 +465,10 @@ pub fn set_cmd_fd(cmd: &mut Command, fds: Vec) -> Result<()> { }; Ok(()) } + +pub fn get_sandbox_cgroup_parent_path(data: &SandboxData) -> Option { + data.config + .as_ref() + .and_then(|c| c.linux.as_ref()) + .map(|l| l.cgroup_parent.clone()) +} diff --git a/vmm/sandbox/src/vm.rs b/vmm/sandbox/src/vm.rs index 59cc4738..20cd8453 100644 --- a/vmm/sandbox/src/vm.rs +++ b/vmm/sandbox/src/vm.rs @@ -14,15 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::str::FromStr; +use std::{collections::HashMap, str::FromStr}; use async_trait::async_trait; use containerd_sandbox::{ error::{Error, Result}, SandboxOption, }; -use serde::Serialize; -use serde_derive::Deserialize; +use serde::{Deserialize, Serialize}; use tokio::sync::watch::Receiver; use crate::{ @@ -70,6 +69,8 @@ pub trait VM: Serialize + Sync + Send { async fn ping(&self) -> Result<()>; fn socket_address(&self) -> String; async fn wait_channel(&self) -> Option>; + async fn vcpus(&self) -> Result; + fn pids(&self) -> Pids; } #[macro_export] @@ -200,3 +201,14 @@ impl FromStr for ShareFsType { } } } + +#[derive(Debug)] +pub struct VcpuThreads { + pub vcpus: HashMap, +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct Pids { + pub vmm_pid: Option, + pub virtiofsd_pid: Option, +}