diff --git a/cats/cats-sql/src/main/kotlin/com/netflix/spinnaker/cats/sql/cluster/SqlClusteredAgentScheduler.kt b/cats/cats-sql/src/main/kotlin/com/netflix/spinnaker/cats/sql/cluster/SqlClusteredAgentScheduler.kt index 8c125424901..4809f5b4f84 100644 --- a/cats/cats-sql/src/main/kotlin/com/netflix/spinnaker/cats/sql/cluster/SqlClusteredAgentScheduler.kt +++ b/cats/cats-sql/src/main/kotlin/com/netflix/spinnaker/cats/sql/cluster/SqlClusteredAgentScheduler.kt @@ -153,6 +153,7 @@ class SqlClusteredAgentScheduler( } private fun findCandidateAgentLocks(): Map { + cleanupZombieAgents() val skip = HashMap(activeAgents).entries val maxConcurrentAgents = dynamicConfigService.getConfig(Int::class.java, "sql.agent.max-concurrent-agents", 100) val availableAgents = maxConcurrentAgents - skip.size @@ -233,6 +234,16 @@ class SqlClusteredAgentScheduler( return trimmedCandidates } + private fun cleanupZombieAgents() { + val zombieAgentThreshold = dynamicConfigService.getConfig(Long::class.java, "sql.agent.zombie-threshold-ms", 3600000) + activeAgents + .filter { it.value.currentTime < System.currentTimeMillis() - zombieAgentThreshold } + .forEach { + log.warn("Found zombie agent {}, removing it", it.key) + activeAgents.remove(it.key, it.value) + } + } + private fun tryAcquireSingle(agentType: String, now: Long, timeout: Long): Boolean { try { withPool(POOL_NAME) {