Skip to content

Commit

Permalink
Merge pull request #38 from Starttoaster/client-banning
Browse files Browse the repository at this point in the history
Implement temp client banning on failed requests
  • Loading branch information
Starttoaster authored May 7, 2024
2 parents beeb100 + b1a2a32 commit de5af64
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 20 deletions.
1 change: 1 addition & 0 deletions internal/prometheus/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ type collectNodeResponse struct {

func (c *Collector) collectNode(ch chan<- prometheus.Metric, node proxmox.GetNodesData, resultChan chan<- collectNodeResponse, wg *sync.WaitGroup) {
defer wg.Done()
defer logger.Logger.Debug("finished requests for node data", "node", node.Node)
var resp collectNodeResponse

// Collect metrics that just need node data
Expand Down
59 changes: 55 additions & 4 deletions internal/proxmox/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,17 @@ var (
// ClusterName gets populated with the proxmox cluster's cluster name on clustered PVE instances
ClusterName string

clients map[string]*proxmox.Client
cash *cache.Cache
clients map[string]wrappedClient
banDuration = time.Duration(1 * time.Minute)
cash *cache.Cache
)

type wrappedClient struct {
client *proxmox.Client
banned bool
bannedUntil time.Time
}

// Init constructs a proxmox API client for this package taking in a token
func Init(endpoints []string, tokenID, token string, tlsVerify bool) error {
// Fail early if endpoints slice is 0 length
Expand All @@ -39,7 +46,7 @@ func Init(endpoints []string, tokenID, token string, tlsVerify bool) error {
}

// Make and init proxmox client map
clients = make(map[string]*proxmox.Client)
clients = make(map[string]wrappedClient)
for _, endpoint := range endpoints {
// Parse URL for hostname
parsedURL, err := url.Parse(endpoint)
Expand All @@ -59,14 +66,19 @@ func Init(endpoints []string, tokenID, token string, tlsVerify bool) error {
}

// Add client to map
clients[hostname] = c
clients[hostname] = wrappedClient{
client: c,
}
}

// init cache -- at longest, cache will live for 29 seconds
// which should ensure metrics are updated if scraping in 30 second intervals
// TODO should cache lifespan and cache expiration intervals be user configurable?
cash = cache.New(24*time.Second, 5*time.Second)

// Maintain client bans
go refreshClientBans()

retrieveClusterName()

return nil
Expand Down Expand Up @@ -95,3 +107,42 @@ func retrieveClusterName() {
log.Logger.Info("discovered PVE cluster", "cluster", ClusterName)
}
}

// refreshClientBans iterates over the configured clients and checks if their ban is still valid over time
func refreshClientBans() {
for {
// Loop through clients from client map
for name, c := range clients {
// Check if the client is banned -- if banned we need to check if the client's banUntil time has expired
if c.banned && time.Now().After(c.bannedUntil) {
// If the ban expired, make a request, see if it succeeds, and unban it if successful. Increase the ban timer if not
_, _, err := c.client.Nodes.GetNodes()
if err == nil {
// Unban client - request successful
log.Logger.Debug("unbanning client, test request successful", "name", name)
clients[name] = wrappedClient{
client: c.client,
}
continue
} else {
// Re-up ban timer - request failed
log.Logger.Debug("re-upping ban on client, test request failed", "name", name, "error", err)
banClient(name, c)
continue
}
}
}

time.Sleep(5 * time.Second)
}
}

// banClient bans a client for the defined duration
func banClient(name string, c wrappedClient) {
log.Logger.Debug("banning client", "name", name, "duration", banDuration)
clients[name] = wrappedClient{
client: c.client,
banned: true,
bannedUntil: time.Now().Add(banDuration),
}
}
17 changes: 15 additions & 2 deletions internal/proxmox/cluster.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package proxmox

import (
"fmt"

"github.com/patrickmn/go-cache"
proxmox "github.com/starttoaster/go-proxmox"
log "github.com/starttoaster/proxmox-exporter/internal/logger"
Expand All @@ -21,16 +23,27 @@ func GetClusterStatus() (*proxmox.GetClusterStatusResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
cluster, _, err = c.Cluster.GetClusterStatus()
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

cluster, _, err = c.client.Cluster.GetClusterStatus()
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if cluster == nil {
return nil, fmt.Errorf("request to get cluster status was not successful. It's possible all clients are banned")
}

// Update cache
cash.Set("GetClusterStatus", cluster, cache.NoExpiration)

Expand Down
105 changes: 91 additions & 14 deletions internal/proxmox/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,27 @@ func GetNodes() (*proxmox.GetNodesResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
nodes, _, err = c.Nodes.GetNodes()
for name, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

nodes, _, err = c.client.Nodes.GetNodes()
if err == nil {
break
} else {
banClient(name, c)
}
}
if err != nil {
return nil, err
}

if nodes == nil {
return nil, fmt.Errorf("request to get node list was not successful. It's possible all clients are banned")
}

// Update cache
cash.Set("GetNodes", nodes, cache.DefaultExpiration)

Expand All @@ -54,16 +65,27 @@ func GetNodeStatus(name string) (*proxmox.GetNodeStatusResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
node, _, err = c.Nodes.GetNodeStatus(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

node, _, err = c.client.Nodes.GetNodeStatus(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if node == nil {
return nil, fmt.Errorf("request to get node status was not successful. It's possible all clients are banned")
}

// Update cache
cash.Set(fmt.Sprintf("GetNodeStatus_%s", name), node, cache.DefaultExpiration)

Expand All @@ -85,16 +107,27 @@ func GetNodeQemu(name string) (*proxmox.GetNodeQemuResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
vms, _, err = c.Nodes.GetNodeQemu(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

vms, _, err = c.client.Nodes.GetNodeQemu(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if vms == nil {
return nil, fmt.Errorf("request to get node VMs was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeQemu_%s", name), vms, cache.DefaultExpiration)

Expand All @@ -116,16 +149,27 @@ func GetNodeLxc(name string) (*proxmox.GetNodeLxcResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
lxcs, _, err = c.Nodes.GetNodeLxc(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

lxcs, _, err = c.client.Nodes.GetNodeLxc(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if lxcs == nil {
return nil, fmt.Errorf("request to get node LXCs was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeLxc_%s", name), lxcs, cache.DefaultExpiration)

Expand All @@ -147,16 +191,27 @@ func GetNodeDisksList(name string) (*proxmox.GetNodeDisksListResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
disks, _, err = c.Nodes.GetNodeDisksList(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

disks, _, err = c.client.Nodes.GetNodeDisksList(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if disks == nil {
return nil, fmt.Errorf("request to get node disks was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeDisksList_%s", name), disks, cache.DefaultExpiration)

Expand All @@ -178,16 +233,27 @@ func GetNodeCertificatesInfo(name string) (*proxmox.GetNodeCertificatesInfoRespo

// Make request if not found in cache
var err error
for _, c := range clients {
certs, _, err = c.Nodes.GetNodeCertificatesInfo(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

certs, _, err = c.client.Nodes.GetNodeCertificatesInfo(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if certs == nil {
return nil, fmt.Errorf("request to get node certificates was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeCertificatesInfo_%s", name), certs, cache.DefaultExpiration)

Expand All @@ -209,16 +275,27 @@ func GetNodeStorage(name string) (*proxmox.GetNodeStorageResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
store, _, err = c.Nodes.GetNodeStorage(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

store, _, err = c.client.Nodes.GetNodeStorage(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if store == nil {
return nil, fmt.Errorf("request to get node storage was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeStorage_%s", name), store, cache.DefaultExpiration)

Expand Down

0 comments on commit de5af64

Please sign in to comment.