diff --git a/agent/src/config/config.rs b/agent/src/config/config.rs index 0a92184034c..ca987f88d84 100644 --- a/agent/src/config/config.rs +++ b/agent/src/config/config.rs @@ -1969,6 +1969,7 @@ pub struct Tunning { pub cpu_affinity: Vec, pub process_scheduling_priority: usize, pub idle_memory_trimming: bool, + pub page_cache_reclaim_percentage: u8, #[serde(with = "humantime_serde")] pub resource_monitoring_interval: Duration, } @@ -1979,6 +1980,7 @@ impl Default for Tunning { cpu_affinity: vec![], process_scheduling_priority: 0, idle_memory_trimming: false, + page_cache_reclaim_percentage: 100, resource_monitoring_interval: Duration::from_secs(10), } } diff --git a/agent/src/config/handler.rs b/agent/src/config/handler.rs index ae028c5693a..3d1914c40a1 100755 --- a/agent/src/config/handler.rs +++ b/agent/src/config/handler.rs @@ -207,6 +207,7 @@ pub struct EnvironmentConfig { pub system_load_circuit_breaker_threshold: f32, pub system_load_circuit_breaker_recover: f32, pub system_load_circuit_breaker_metric: agent::SystemLoadMetric, + pub page_cache_reclaim_percentage: u8, } #[derive(Clone, PartialEq, Eq, Debug)] @@ -1602,6 +1603,7 @@ impl TryFrom<(Config, UserConfig)> for ModuleConfig { .circuit_breakers .relative_sys_load .metric, + page_cache_reclaim_percentage: conf.global.tunning.page_cache_reclaim_percentage, }, synchronizer: SynchronizerConfig { sync_interval: conf.global.communication.proactive_request_interval, @@ -3924,6 +3926,13 @@ impl ConfigHandler { tunning.resource_monitoring_interval = new_tunning.resource_monitoring_interval; restart_agent = !first_run; } + if tunning.page_cache_reclaim_percentage != new_tunning.page_cache_reclaim_percentage { + info!( + "Update global.tunning.page_cache_reclaim_percentage from {:?} to {:?}.", + tunning.page_cache_reclaim_percentage, new_tunning.page_cache_reclaim_percentage + ); + tunning.page_cache_reclaim_percentage = new_tunning.page_cache_reclaim_percentage; + } // dev let dev = &mut config.dev; diff --git 
a/agent/src/monitor.rs b/agent/src/monitor.rs
index 95dde7a9cc4..2209e39771d 100644
--- a/agent/src/monitor.rs
+++ b/agent/src/monitor.rs
@@ -35,6 +35,7 @@ use crate::{
     config::handler::EnvironmentAccess,
     error::{Error, Result},
     utils::{
+        cgroups,
         process::{get_current_sys_memory_percentage, get_file_and_size_sum},
         stats::{
             self, Collector, Countable, Counter, CounterType, CounterValue, RefCountable,
@@ -318,6 +319,16 @@ impl RefCountable for SysStatusBroker {
                 Err(_) => CounterValue::Unsigned(0),
             },
         ));
+        #[cfg(target_os = "linux")]
+        metrics.push((
+            "page_cache",
+            CounterType::Gauged,
+            if let Some(m_stat) = cgroups::memory_info() {
+                CounterValue::Unsigned(m_stat.stat.cache)
+            } else {
+                CounterValue::Unsigned(0)
+            },
+        ));
         metrics
     }
 }
diff --git a/agent/src/utils/cgroups/linux.rs b/agent/src/utils/cgroups/linux.rs
index f2daa7585c6..277ec342ff3 100644
--- a/agent/src/utils/cgroups/linux.rs
+++ b/agent/src/utils/cgroups/linux.rs
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+use std::path::{Path, PathBuf};
 use std::sync::{Arc, Condvar, Mutex};
 use std::thread::JoinHandle;
 use std::time::Duration;
@@ -24,9 +25,14 @@ use crate::config::handler::EnvironmentAccess;
 use crate::utils::environment::is_kernel_available;
 
 use arc_swap::access::Access;
-use cgroups_rs::cgroup_builder::*;
-use cgroups_rs::*;
-use log::{info, warn};
+use cgroups_rs::{
+    cgroup_builder::CgroupBuilder,
+    cpu::CpuController,
+    hierarchies,
+    memory::{MemController, Memory},
+    Cgroup, CgroupPid, Controller, CpuResources, MemoryResources, Resources,
+};
+use log::{debug, info, trace, warn};
 use public::consts::{DEFAULT_CPU_CFS_PERIOD_US, PROCESS_NAME};
 
 pub struct Cgroups {
@@ -40,32 +46,28 @@ pub struct Cgroups {
 
 const CHECK_INTERVAL: Duration = Duration::from_secs(1);
 
+/// Reports whether `/proc/filesystems` can be read and lists a filesystem whose name
+/// contains "cgroup" (case-insensitive).
+fn cgroups_supported() -> bool {
+    let Ok(fs) = fs::read_to_string("/proc/filesystems") else {
+        return false;
+    };
+    fs.lines()
+        .any(|line| line.to_lowercase().contains("cgroup"))
+}
+
 impl Cgroups {
     /// 创建cgroup hierarchy
     pub fn new(pid: u64, config: EnvironmentAccess) -> Result {
-        let contents = match fs::read_to_string("/proc/filesystems") {
-            Ok(file_contents) => file_contents,
-            Err(e) => {
-                return Err(Error::CgroupsNotSupported(e.to_string()));
-            }
-        };
-        let mut cgroup_supported = false;
-        for line in contents.lines() {
-            // 检查系统是否支持cgroup
-            if line.to_lowercase().contains("cgroup") {
-                cgroup_supported = true;
-                break;
-            }
-        }
-        if !cgroup_supported {
-            return Err(Error::CgroupsNotSupported(format!(
-                "cgroups v1 or v2 is not found."
-            )));
+        if !cgroups_supported() {
+            return Err(Error::CgroupsNotSupported(
+                "read /proc/filesystems failed or cgroups/cgroups2 not found.".to_string(),
+            ));
         }
         let hier = hierarchies::auto();
         let is_v2 = hier.v2();
         let cg: Cgroup = CgroupBuilder::new(PROCESS_NAME).build(hier);
-        let cpus: &cpu::CpuController = match cg.controller_of() {
+        let cpus: &CpuController = match cg.controller_of() {
             Some(controller) => controller,
             None => {
                 return Err(Error::CpuControllerSetFailed(format!(
@@ -73,7 +75,7 @@ impl Cgroups {
                 )));
             }
         };
-        let mem: &memory::MemController = match cg.controller_of() {
+        let mem: &MemController = match cg.controller_of() {
             Some(controller) => controller,
             None => {
                 return Err(Error::MemControllerSetFailed(format!(
@@ -229,3 +231,142 @@ pub fn is_cgroup_procs_writable() -> bool {
     const MIN_KERNEL_VERSION_CGROUP_PROCS: &str = "3";
     is_kernel_available(MIN_KERNEL_VERSION_CGROUP_PROCS)
 }
+
+const PID1_ROOT: &str = "/proc/1/root";
+
+/*
+ * The path of container memory cgroup from its own namespace is `/sys/fs/cgroup/memory`.
+ * However, the path is mounted read-only, making it impossible to reclaim memory cache with:
+ *
+ * `echo 0 > /sys/fs/cgroup/memory/memory.force_empty`
+ *
+ * The approach here is to take a little detour, visiting the actual memory cgroup path from global cgroup namespace.
+ * The path would be:
+ *
+ * /proc/1/root/{memory_cgroup_mount_point}/{cgroup_path}/memory.force_empty
+ */
+/// Builds the directory of this process's v1 memory cgroup as seen through `/proc/1/root`.
+///
+/// Returns `Ok(None)` if no `cgroup` mount with the `memory` option is found, if the process
+/// has no cgroup entry listing the `memory` controller, or if that entry's path is `/`.
+/// Errors from `procfs` are propagated to the caller.
+fn get_memory_cgroup_path() -> procfs::ProcResult<Option<PathBuf>> {
+    let mut path = PathBuf::from(PID1_ROOT);
+    let proc = procfs::process::Process::myself()?;
+
+    let Some(mount_info) = proc
+        .mountinfo()?
+        .into_iter()
+        .find(|m| m.fs_type == "cgroup" && m.super_options.contains_key("memory"))
+    else {
+        debug!("memory cgroup not found");
+        return Ok(None);
+    };
+    trace!(
+        "memory cgroup mount point: {}",
+        mount_info.mount_point.display()
+    );
+    let mut mount_point = mount_info.mount_point.components();
+    mount_point.next(); // skip "/"
+    path.extend(mount_point);
+
+    let Some(cg_info) = proc
+        .cgroups()?
+        .into_iter()
+        .find(|cg| cg.controllers.iter().any(|c| c == "memory"))
+    else {
+        return Ok(None);
+    };
+    trace!("memory cgroup path: {}", cg_info.pathname);
+    if cg_info.pathname == "/" {
+        debug!("memory cgroup is mounted on root");
+        return Ok(None);
+    }
+    let mut cg_path = Path::new(&cg_info.pathname).components();
+    cg_path.next(); // skip "/"
+    path.extend(cg_path);
+
+    trace!("memory cgroup path: {}", path.display());
+    Ok(Some(path))
+}
+
+/// Returns `true` only when cgroups are supported and not running in v2 unified mode.
+fn cgroups_v1_check() -> bool {
+    if !cgroups_supported() {
+        debug!("cgroups not supported for this system");
+        return false;
+    }
+    if hierarchies::is_cgroup2_unified_mode() {
+        debug!("cgroups v2 is not supported");
+        return false;
+    }
+
+    true
+}
+
+/// Resolves the v1 memory cgroup directory, logging why when it is unavailable.
+fn memory_cgroup_v1_dir() -> Option<PathBuf> {
+    if !cgroups_v1_check() {
+        return None;
+    }
+    match get_memory_cgroup_path() {
+        Ok(Some(path)) => Some(path),
+        Ok(None) => {
+            debug!("cgroups memory mount point not found or is invalid");
+            None
+        }
+        Err(e) => {
+            warn!("get memory path failed: {e}");
+            None
+        }
+    }
+}
+
+/// Reads the statistics of the agent's v1 memory cgroup, or `None` if it cannot be located.
+pub(crate) fn memory_info() -> Option<Memory> {
+    memory_cgroup_v1_dir().map(|dir| MemController::new(dir, false).memory_stat())
+}
+
+/// Writes `0` to `memory.force_empty` when page cache reaches `threshold` percent of the limit.
+///
+/// `threshold` values of 100 or more disable the check. Returns `true` only when the write to
+/// `memory.force_empty` succeeded; every skipped or failed attempt returns `false`.
+pub(crate) fn page_cache_reclaim_check(threshold: u8) -> bool {
+    if threshold >= 100 {
+        return false;
+    }
+    let Some(mem_dir) = memory_cgroup_v1_dir() else {
+        return false;
+    };
+    // Join instead of `set_file_name`: the latter replaces the last path component, i.e. the
+    // cgroup's own directory, and would address the parent cgroup's control file.
+    let reclaim_path = mem_dir.join("memory.force_empty");
+    let mc = MemController::new(mem_dir, false);
+
+    let m_stat = mc.memory_stat();
+    // A zero limit would panic in the division and a negative one would wrap in the cast.
+    if m_stat.limit_in_bytes <= 0 {
+        debug!("memory limit {} is not usable", m_stat.limit_in_bytes);
+        return false;
+    }
+    let percentage = m_stat.stat.cache.saturating_mul(100) / m_stat.limit_in_bytes as u64;
+    if percentage < threshold as u64 {
+        debug!("cache / limit = {percentage}% < {threshold}%");
+        return false;
+    }
+
+    debug!("cache before reclaim: {}", m_stat.stat.cache);
+    if let Err(e) = fs::write(&reclaim_path, b"0") {
+        warn!(
+            "reclaim memory cache write to {} failed: {e}",
+            reclaim_path.display()
+        );
+        return false;
+    }
+    info!(
+        "page cache reclaimed {} -> {}",
+        m_stat.stat.cache,
+        mc.memory_stat().stat.cache
+    );
+    true
+}
diff --git a/agent/src/utils/guard.rs b/agent/src/utils/guard.rs
index 4139ca01cca..09f995840fe 100644
--- a/agent/src/utils/guard.rs
+++ b/agent/src/utils/guard.rs
@@ -355,6 +355,8 @@ impl Guard {
         let in_container = running_in_container();
         let cgroups_disabled = self.cgroups_disabled;
         let mut last_exceeded = get_timestamp(0);
+        #[cfg(target_os = "linux")]
+        let mut last_page_reclaim = Instant::now();
 
         let thread = thread::Builder::new().name("guard".to_owned()).spawn(move || {
             let mut system_load = SystemLoadGuard::new(system.clone(), exception_handler.clone());
@@ -436,6 +438,12 @@ impl Guard {
                     unsafe { let _ = malloc_trim(0); }
                 }
 
+                #[cfg(target_os = "linux")]
+                if last_page_reclaim.elapsed() >= Duration::from_secs(60) {
+                    last_page_reclaim = Instant::now();
+                    let _ = crate::utils::cgroups::page_cache_reclaim_check(config.page_cache_reclaim_percentage);
+                }
+
                 // Periodic memory checks are necessary:
                 // Cgroups does not count the memory of RssFile, and AF_PACKET Block occupies RssFile.
// Therefore, using Cgroups to limit the memory usage may not be accurate in some scenarios. diff --git a/server/agent_config/README-CH.md b/server/agent_config/README-CH.md index 5231554d21b..24f421ad1fd 100644 --- a/server/agent_config/README-CH.md +++ b/server/agent_config/README-CH.md @@ -652,6 +652,42 @@ global: 开启闲置内存修剪特性,将降低 agent 内存使用量,但可能会损失 agent 处理性能。 +### Page Cache 回收百分比 {#global.tunning.page_cache_reclaim_percentage} + +**标签**: + +`hot_update` + +**FQCN**: + +`global.tunning.page_cache_reclaim_percentage` + +Upgrade from old version: `static_config.page-cache-reclaim-percentage` + +**默认值**: +```yaml +global: + tunning: + page_cache_reclaim_percentage: 100 +``` + +**模式**: +| Key | Value | +| ---- | ---------------------------- | +| Type | int | +| Range | [0, 100] | + +**详细描述**: + +当文件页缓存和 cgroup 内存限制的百分比超过此阈值时,agent 将清空文件页缓存。Cgroup 的内存使用量包括匿名内存和文件页缓存。在某些情况下,仅仅是文件页缓存就可能导致 +cgroup 因为内存不足杀死 agent 进程。为了避免这种情况,agent 将定期强制清空文件页缓存, +且由于 agent 的文件 I/O 量不大,这不太可能对 agent 的性能造成影响,但同一 cgroup 下的其他 +进程可能会受到影响。不建议设置很小的值。 +设置该值为 100 时,该特性不生效。回收的最小间隔是 1 分钟。 +注意: +- 该特性仅支持 cgroups v1。 +- 如果 agent 的 memory cgroup 路径是 “/”,该特性不生效。 + ### 资源监控间隔 {#global.tunning.resource_monitoring_interval} **标签**: @@ -6928,7 +6964,7 @@ processors: | Key | Value | | ---- | ---------------------------- | | Type | duration | -| Range | ['1s', '10s'] | +| Range | ['1s', '20s'] | **详细描述**: @@ -6959,7 +6995,7 @@ processors: | Key | Value | | ---- | ---------------------------- | | Type | duration | -| Range | ['0s', '10s'] | +| Range | ['0s', '20s'] | **详细描述**: diff --git a/server/agent_config/README.md b/server/agent_config/README.md index 3008dcfa815..ee300ffda05 100644 --- a/server/agent_config/README.md +++ b/server/agent_config/README.md @@ -664,6 +664,43 @@ global: Proactive memory trimming can effectively reduce memory usage, but there may be performance loss. 
+### Page Cache Reclaim Percentage {#global.tunning.page_cache_reclaim_percentage} + +**Tags**: + +`hot_update` + +**FQCN**: + +`global.tunning.page_cache_reclaim_percentage` + +Upgrade from old version: `static_config.page-cache-reclaim-percentage` + +**Default value**: +```yaml +global: + tunning: + page_cache_reclaim_percentage: 100 +``` + +**Schema**: +| Key | Value | +| ---- | ---------------------------- | +| Type | int | +| Range | [0, 100] | + +**Description**: + +A page cache reclaim is triggered when the percentage of page cache to cgroups memory.limit_in_bytes reaches this value. +Both anonymous memory and file page cache are accounted for in cgroup's memory usage. +Under some circumstances, page cache alone can cause cgroup to OOM kill agent process. To avoid this, agent can reclaim page cache periodically. +Although reclaiming may not cause performance issues for agent who doesn't have much I/O, other processes in +the same cgroup may be affected. Very low values are not recommended. +Setting this value to 100 disables this feature. Reclaims happen at most once per minute. +Note: +- This feature is available for cgroups v1 only. +- This feature is disabled if agent memory cgroup path is "/". 
+ ### Resource Monitoring Interval {#global.tunning.resource_monitoring_interval} **Tags**: @@ -7134,7 +7171,7 @@ processors: | Key | Value | | ---- | ---------------------------- | | Type | duration | -| Range | ['1s', '10s'] | +| Range | ['1s', '20s'] | **Description**: @@ -7167,7 +7204,7 @@ processors: | Key | Value | | ---- | ---------------------------- | | Type | duration | -| Range | ['0s', '10s'] | +| Range | ['0s', '20s'] | **Description**: diff --git a/server/agent_config/config.go b/server/agent_config/config.go index a89e3ea08e3..8c5cedc7fc5 100644 --- a/server/agent_config/config.go +++ b/server/agent_config/config.go @@ -196,6 +196,7 @@ type StaticConfig struct { CheckCoreFileDisabled *bool `yaml:"check-core-file-disabled,omitempty"` SoPlugins []string `yaml:"so-plugins,omitempty"` MemoryTrimDisabled *bool `yaml:"memory-trim-disabled,omitempty"` + PageCacheReclaimPercentage *int `yaml:"page-cache-reclaim-percentage,omitempty"` FastPathDisabled *bool `yaml:"fast-path-disabled,omitempty"` ForwardCapacity *uint32 `yaml:"forward-capacity,omitempty"` RrtTcpTimeout *string `yaml:"rrt-tcp-timeout,omitempty"` diff --git a/server/agent_config/example.yaml b/server/agent_config/example.yaml index 96eece5e40e..161b415d67d 100644 --- a/server/agent_config/example.yaml +++ b/server/agent_config/example.yaml @@ -1817,12 +1817,24 @@ static_config: ## will cause agent to restart immediately. #max-sockets-tolerate-interval: 60s - ################# - ## Memory trim ## - ################# - # Note: - # Using memory trimming can effectively reduce memory usage, but there may be performance loss - memory-trim-disabled: false + #################################### + ## Page Cache Reclaim Percentage ## + #################################### + ## The percentage threshold of page cache to cgroup memory limit that triggers reclaim. + ## Default: 100. Unit: percentage. Range: [0, 100]. 
+ ## Note: + ## A page cache reclaim is triggered when the percentage of page cache and + ## cgroups memory.limit_in_bytes exceeds this value. + ## Both anonymous memory and file page cache are accounted for in cgroup's memory usage. + ## Under some circumstances, page cache alone can cause cgroup to OOM kill agent process. + ## To avoid this, agent can reclaim page cache periodically. Although reclaiming may not + ## cause performance issues for agent who doesn't have much I/O, other processes in + ## the same cgroup may be affected. Very low values are not recommended. + ## Note: + ## - This feature is available for cgroups v1 only. + ## - This feature is disabled if agent memory cgroup path is "/". + ## - The minimal interval of reclaims is 1 minute. + #page-cache-reclaim-percentage: 100 ##################### ## Check core file ## diff --git a/server/agent_config/template.yaml b/server/agent_config/template.yaml index ab2fd4cba97..d1fd5e40ab9 100644 --- a/server/agent_config/template.yaml +++ b/server/agent_config/template.yaml @@ -443,6 +443,40 @@ global: # 开启闲置内存修剪特性,将降低 agent 内存使用量,但可能会损失 agent 处理性能。 # upgrade_from: static_config.memory-trim-disabled idle_memory_trimming: true + # type: int + # name: + # en: Page Cache Reclaim Percentage + # ch: Page Cache 回收百分比 + # unit: + # range: [0, 100] + # enum_options: [] + # modification: hot_update + # ee_feature: false + # description: + # en: |- + # A page cache reclaim is triggered when the percentage of page cache and + # cgroups memory.limit_in_bytes exceeds this value. + # Both anonymous memory and file page cache are accounted for in cgroup's memory usage. + # Under some circumstances, page cache alone can cause cgroup to OOM kill agent process. + # To avoid this, agent can reclaim page cache periodically. Although reclaiming may not + # cause performance issues for agent who doesn't have much I/O, other processes in + # the same cgroup may be affected. Very low values are not recommended. 
+ # Note: + # - This feature is available for cgroups v1 only. + # - This feature is disabled if agent memory cgroup path is "/". + # - The minimal interval of reclaims is 1 minute. + # ch: |- + # 当文件页缓存和 cgroup 内存限制的百分比超过此阈值时,agent 将清空文件页缓存。 + # Cgroup 的内存使用量包括匿名内存和文件页缓存。在某些情况下,仅仅是文件页缓存就可能导致 + # cgroup 因为内存不足杀死 agent 进程。为了避免这种情况,agent 将定期强制清空文件页缓存, + # 且由于 agent 的文件 I/O 量不大,这不太可能对 agent 的性能造成影响,但同一 cgroup 下的其他 + # 进程可能会受到影响。不建议设置很小的值。 + # 注意: + # - 该特性仅支持 cgroups v1。 + # - 如果 agent 的 memory cgroup 路径是 “/”,该特性不生效。 + # - 回收的最小间隔是 1 分钟。 + # upgrade_from: static_config.page-cache-reclaim-percentage + page_cache_reclaim_percentage: 100 # type: duration # name: # en: Resource Monitoring Interval @@ -4846,7 +4880,7 @@ processors: # en: Maximum Tolerable Packet Delay # ch: 最大可容忍的 Packet 延迟 # unit: - # range: [1s, 10s] + # range: [1s, 20s] # enum_options: [] # modification: agent_restart # ee_feature: false @@ -4866,7 +4900,7 @@ processors: # en: Extra Tolerable Flow Delay # ch: 额外可容忍的 Flow 延迟 # unit: - # range: [0s, 10s] + # range: [0s, 20s] # enum_options: [] # modification: agent_restart # ee_feature: false