diff --git a/internal/prometheus/node.go b/internal/prometheus/node.go index dd1d1fc..8a70dff 100644 --- a/internal/prometheus/node.go +++ b/internal/prometheus/node.go @@ -21,6 +21,7 @@ type collectNodeResponse struct { func (c *Collector) collectNode(ch chan<- prometheus.Metric, node proxmox.GetNodesData, resultChan chan<- collectNodeResponse, wg *sync.WaitGroup) { defer wg.Done() + defer logger.Logger.Debug("finished requests for node data", "node", node.Node) var resp collectNodeResponse // Collect metrics that just need node data diff --git a/internal/proxmox/client.go b/internal/proxmox/client.go index 7fe0207..c192c13 100755 --- a/internal/proxmox/client.go +++ b/internal/proxmox/client.go @@ -18,10 +18,17 @@ var ( // ClusterName gets populated with the proxmox cluster's cluster name on clustered PVE instances ClusterName string - clients map[string]*proxmox.Client - cash *cache.Cache + clients map[string]wrappedClient + banDuration = time.Duration(1 * time.Minute) + cash *cache.Cache ) +type wrappedClient struct { + client *proxmox.Client + banned bool + bannedUntil time.Time +} + // Init constructs a proxmox API client for this package taking in a token func Init(endpoints []string, tokenID, token string, tlsVerify bool) error { // Fail early if endpoints slice is 0 length @@ -39,7 +46,7 @@ func Init(endpoints []string, tokenID, token string, tlsVerify bool) error { } // Make and init proxmox client map - clients = make(map[string]*proxmox.Client) + clients = make(map[string]wrappedClient) for _, endpoint := range endpoints { // Parse URL for hostname parsedURL, err := url.Parse(endpoint) @@ -59,7 +66,9 @@ func Init(endpoints []string, tokenID, token string, tlsVerify bool) error { } // Add client to map - clients[hostname] = c + clients[hostname] = wrappedClient{ + client: c, + } } // init cache -- at longest, cache will live for 29 seconds @@ -67,6 +76,9 @@ func Init(endpoints []string, tokenID, token string, tlsVerify bool) error { // TODO should cache lifespan and cache expiration intervals be user configurable? cash = cache.New(24*time.Second, 5*time.Second) + // Maintain client bans + go refreshClientBans() + retrieveClusterName() return nil @@ -95,3 +107,42 @@ func retrieveClusterName() { log.Logger.Info("discovered PVE cluster", "cluster", ClusterName) } } + +// refreshClientBans iterates over the configured clients and checks if their ban is still valid over time +func refreshClientBans() { + for { + // Loop through clients from client map + for name, c := range clients { + // Check if the client is banned -- if banned we need to check if the client's banUntil time has expired + if c.banned && time.Now().After(c.bannedUntil) { + // If the ban expired, make a request, see if it succeeds, and unban it if successful. Increase the ban timer if not + _, _, err := c.client.Nodes.GetNodes() + if err == nil { + // Unban client - request successful + log.Logger.Debug("unbanning client, test request successful", "name", name) + clients[name] = wrappedClient{ + client: c.client, + } + continue + } else { + // Re-up ban timer - request failed + log.Logger.Debug("re-upping ban on client, test request failed", "name", name, "error", err) + banClient(name, c) + continue + } + } + } + + time.Sleep(5 * time.Second) + } +} + +// banClient bans a client for the defined duration +func banClient(name string, c wrappedClient) { + log.Logger.Debug("banning client", "name", name, "duration", banDuration) + clients[name] = wrappedClient{ + client: c.client, + banned: true, + bannedUntil: time.Now().Add(banDuration), + } +} diff --git a/internal/proxmox/cluster.go b/internal/proxmox/cluster.go index 6a15a32..5505b0c 100644 --- a/internal/proxmox/cluster.go +++ b/internal/proxmox/cluster.go @@ -1,6 +1,8 @@ package proxmox import ( + "fmt" + "github.com/patrickmn/go-cache" proxmox "github.com/starttoaster/go-proxmox" log "github.com/starttoaster/proxmox-exporter/internal/logger" @@ -21,16 +23,27 @@ func GetClusterStatus() (*proxmox.GetClusterStatusResponse, error) { // Make request if not found in cache var err error - for _, c := range clients { - cluster, _, err = c.Cluster.GetClusterStatus() + for clientName, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + cluster, _, err = c.client.Cluster.GetClusterStatus() if err == nil { break + } else { + banClient(clientName, c) } } if err != nil { return nil, err } + if cluster == nil { + return nil, fmt.Errorf("request to get cluster status was not successful. It's possible all clients are banned") + } + // Update cache cash.Set("GetClusterStatus", cluster, cache.NoExpiration) diff --git a/internal/proxmox/nodes.go b/internal/proxmox/nodes.go index b263f90..ce4849d 100755 --- a/internal/proxmox/nodes.go +++ b/internal/proxmox/nodes.go @@ -23,16 +23,27 @@ func GetNodes() (*proxmox.GetNodesResponse, error) { // Make request if not found in cache var err error - for _, c := range clients { - nodes, _, err = c.Nodes.GetNodes() + for name, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + nodes, _, err = c.client.Nodes.GetNodes() if err == nil { break + } else { + banClient(name, c) } } if err != nil { return nil, err } + if nodes == nil { + return nil, fmt.Errorf("request to get node list was not successful. It's possible all clients are banned") + } + // Update cache cash.Set("GetNodes", nodes, cache.DefaultExpiration) @@ -54,16 +65,27 @@ func GetNodeStatus(name string) (*proxmox.GetNodeStatusResponse, error) { // Make request if not found in cache var err error - for _, c := range clients { - node, _, err = c.Nodes.GetNodeStatus(name) + for clientName, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + node, _, err = c.client.Nodes.GetNodeStatus(name) if err == nil { break + } else { + banClient(clientName, c) } } if err != nil { return nil, err } + if node == nil { + return nil, fmt.Errorf("request to get node status was not successful. It's possible all clients are banned") + } + // Update cache cash.Set(fmt.Sprintf("GetNodeStatus_%s", name), node, cache.DefaultExpiration) @@ -85,16 +107,27 @@ func GetNodeQemu(name string) (*proxmox.GetNodeQemuResponse, error) { // Make request if not found in cache var err error - for _, c := range clients { - vms, _, err = c.Nodes.GetNodeQemu(name) + for clientName, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + vms, _, err = c.client.Nodes.GetNodeQemu(name) if err == nil { break + } else { + banClient(clientName, c) } } if err != nil { return nil, err } + if vms == nil { + return nil, fmt.Errorf("request to get node VMs was not successful. It's possible all clients are banned") + } + // Update per-node cache since we have it cash.Set(fmt.Sprintf("GetNodeQemu_%s", name), vms, cache.DefaultExpiration) @@ -116,16 +149,27 @@ func GetNodeLxc(name string) (*proxmox.GetNodeLxcResponse, error) { // Make request if not found in cache var err error - for _, c := range clients { - lxcs, _, err = c.Nodes.GetNodeLxc(name) + for clientName, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + lxcs, _, err = c.client.Nodes.GetNodeLxc(name) if err == nil { break + } else { + banClient(clientName, c) } } if err != nil { return nil, err } + if lxcs == nil { + return nil, fmt.Errorf("request to get node LXCs was not successful. It's possible all clients are banned") + } + // Update per-node cache since we have it cash.Set(fmt.Sprintf("GetNodeLxc_%s", name), lxcs, cache.DefaultExpiration) @@ -147,16 +191,27 @@ func GetNodeDisksList(name string) (*proxmox.GetNodeDisksListResponse, error) { // Make request if not found in cache var err error - for _, c := range clients { - disks, _, err = c.Nodes.GetNodeDisksList(name) + for clientName, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + disks, _, err = c.client.Nodes.GetNodeDisksList(name) if err == nil { break + } else { + banClient(clientName, c) } } if err != nil { return nil, err } + if disks == nil { + return nil, fmt.Errorf("request to get node disks was not successful. It's possible all clients are banned") + } + // Update per-node cache since we have it cash.Set(fmt.Sprintf("GetNodeDisksList_%s", name), disks, cache.DefaultExpiration) @@ -178,16 +233,27 @@ func GetNodeCertificatesInfo(name string) (*proxmox.GetNodeCertificatesInfoRespo // Make request if not found in cache var err error - for _, c := range clients { - certs, _, err = c.Nodes.GetNodeCertificatesInfo(name) + for clientName, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + certs, _, err = c.client.Nodes.GetNodeCertificatesInfo(name) if err == nil { break + } else { + banClient(clientName, c) } } if err != nil { return nil, err } + if certs == nil { + return nil, fmt.Errorf("request to get node certificates was not successful. It's possible all clients are banned") + } + // Update per-node cache since we have it cash.Set(fmt.Sprintf("GetNodeCertificatesInfo_%s", name), certs, cache.DefaultExpiration) @@ -209,16 +275,27 @@ func GetNodeStorage(name string) (*proxmox.GetNodeStorageResponse, error) { // Make request if not found in cache var err error - for _, c := range clients { - store, _, err = c.Nodes.GetNodeStorage(name) + for clientName, c := range clients { + // Check if client was banned, skip if is + if c.banned { + continue + } + + store, _, err = c.client.Nodes.GetNodeStorage(name) if err == nil { break + } else { + banClient(clientName, c) } } if err != nil { return nil, err } + if store == nil { + return nil, fmt.Errorf("request to get node storage was not successful. It's possible all clients are banned") + } + // Update per-node cache since we have it cash.Set(fmt.Sprintf("GetNodeStorage_%s", name), store, cache.DefaultExpiration)