diff --git a/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml b/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml new file mode 100644 index 000000000..efd760113 --- /dev/null +++ b/bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml @@ -0,0 +1,38 @@ +--- +{{ if and .IsPrometheusOperatorInstalled .PrometheusOperatorDeployRules }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: sriov-vf-rules + namespace: {{.Namespace}} +spec: + groups: + - name: sriov-network-metrics-operator.rules + interval: 30s + rules: + - expr: | + sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_tx_packets + - expr: | + sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_packets + - expr: | + sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_tx_bytes + - expr: | + sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_bytes + - expr: | + sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_tx_dropped + - expr: | + sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_dropped + - expr: | + sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_broadcast + - expr: | + sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice + record: network:sriov_vf_rx_multicast +{{ end }} + diff --git a/controllers/sriovoperatorconfig_controller.go b/controllers/sriovoperatorconfig_controller.go index 8d028d8eb..1121b623f 100644 --- a/controllers/sriovoperatorconfig_controller.go +++ b/controllers/sriovoperatorconfig_controller.go @@ -241,6 +241,7 @@ func (r *SriovOperatorConfigReconciler) syncMetricsExporter(ctx context.Context, data.Data["IsOpenshift"] = r.PlatformHelper.IsOpenshiftCluster() data.Data["IsPrometheusOperatorInstalled"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED")) == trueString + data.Data["PrometheusOperatorDeployRules"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES")) == trueString data.Data["PrometheusOperatorServiceAccount"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT") data.Data["PrometheusOperatorNamespace"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE") diff --git a/controllers/sriovoperatorconfig_controller_test.go b/controllers/sriovoperatorconfig_controller_test.go index 9ba05490b..6a98925eb 100644 --- a/controllers/sriovoperatorconfig_controller_test.go +++ b/controllers/sriovoperatorconfig_controller_test.go @@ -380,6 +380,8 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() { It("should deploy extra configuration when the Prometheus operator is installed", func() { DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED")) os.Setenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", "true") + DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES")) + os.Setenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", "true") err := util.WaitForNamespacedObject(&rbacv1.Role{}, k8sClient, testNamespace, "prometheus-k8s", util.RetryInterval, util.APITimeout) Expect(err).ToNot(HaveOccurred()) @@ -394,6 +396,14 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() { Version: "v1", }, client.ObjectKey{Namespace: testNamespace, Name: "sriov-network-metrics-exporter"}) + + assertResourceExists( + schema.GroupVersionKind{ + Group: "monitoring.coreos.com", + Kind: "PrometheusRule", + Version: "v1", + }, + client.ObjectKey{Namespace: testNamespace, Name: "sriov-vf-rules"}) }) }) }) diff --git a/deploy/operator.yaml b/deploy/operator.yaml index b2aa302ab..e9fb25de3 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -78,6 +78,8 @@ spec: value: $METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE - name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED value: "$METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED" + - name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES + value: "$METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES" - name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT value: $METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT - name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE diff --git a/deploy/role.yaml b/deploy/role.yaml index a24f13729..0a6c27a21 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -29,9 +29,12 @@ rules: - monitoring.coreos.com resources: - servicemonitors + - prometheusrules verbs: - get - create + - update + - delete - apiGroups: - apps resourceNames: diff --git a/deployment/sriov-network-operator-chart/README.md b/deployment/sriov-network-operator-chart/README.md index 8875b36eb..a867613b2 100644 --- a/deployment/sriov-network-operator-chart/README.md +++ b/deployment/sriov-network-operator-chart/README.md @@ -89,6 +89,7 @@ We have introduced the following Chart parameters. | `operator.metricsExporter.prometheusOperator.enabled` | bool | false | Wheter the operator shoud configure Prometheus resources or not (e.g. `ServiceMonitors`). | | `operator.metricsExporter.prometheusOperator.serviceAccount` | string | `prometheus-k8s` | The service account used by the Prometheus Operator. This is used to give Prometheus the permission to list resource in the SR-IOV operator namespace | | `operator.metricsExporter.prometheusOperator.namespace` | string | `monitoring` | The namespace where the Prometheus Operator is installed. Setting this variable makes the operator deploy `monitoring.coreos.com` resources. | +| `operator.metricsExporter.prometheusOperator.deployRules` | bool | false | Whether the operator should deploy `PrometheusRules` to scrape namespace version of metrics. | #### Admission Controllers parameters diff --git a/deployment/sriov-network-operator-chart/templates/operator.yaml b/deployment/sriov-network-operator-chart/templates/operator.yaml index 12a9cc660..0e89d1959 100644 --- a/deployment/sriov-network-operator-chart/templates/operator.yaml +++ b/deployment/sriov-network-operator-chart/templates/operator.yaml @@ -83,6 +83,8 @@ spec: {{- if .Values.operator.metricsExporter.prometheusOperator.enabled }} - name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED value: {{ .Values.operator.metricsExporter.prometheusOperator.enabled | quote}} + - name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES + value: {{ .Values.operator.metricsExporter.prometheusOperator.deployRules | quote}} - name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT value: {{ .Values.operator.metricsExporter.prometheusOperator.serviceAccount }} - name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE diff --git a/deployment/sriov-network-operator-chart/templates/role.yaml b/deployment/sriov-network-operator-chart/templates/role.yaml index 29cf80cce..6551b5775 100644 --- a/deployment/sriov-network-operator-chart/templates/role.yaml +++ b/deployment/sriov-network-operator-chart/templates/role.yaml @@ -32,9 +32,12 @@ rules: - monitoring.coreos.com resources: - servicemonitors + - prometheusrules verbs: - get - create + - update + - delete - apiGroups: - apps resourceNames: diff --git a/deployment/sriov-network-operator-chart/values.yaml b/deployment/sriov-network-operator-chart/values.yaml index 3e73006b7..c70d6e323 100644 --- a/deployment/sriov-network-operator-chart/values.yaml +++ b/deployment/sriov-network-operator-chart/values.yaml @@ -35,6 +35,7 @@ operator: enabled: false serviceAccount: "prometheus-k8s" namespace: "monitoring" + deployRules: false admissionControllers: enabled: false certificates: diff --git a/hack/run-e2e-conformance-virtual-ocp.sh b/hack/run-e2e-conformance-virtual-ocp.sh index a61906fb2..0092fcdad 100755 --- a/hack/run-e2e-conformance-virtual-ocp.sh +++ b/hack/run-e2e-conformance-virtual-ocp.sh @@ -191,6 +191,7 @@ export DEV_MODE=TRUE export CLUSTER_HAS_EMULATED_PF=TRUE export OPERATOR_LEADER_ELECTION_ENABLE=true export METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED=true +export METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULE=true export METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT:-"prometheus-k8s"} export METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE:-"openshfit-monitoring"} diff --git a/test/conformance/tests/test_exporter_metrics.go b/test/conformance/tests/test_exporter_metrics.go index e81f63067..804432f04 100644 --- a/test/conformance/tests/test_exporter_metrics.go +++ b/test/conformance/tests/test_exporter_metrics.go @@ -2,9 +2,12 @@ package tests import ( "context" + "encoding/json" "fmt" + "net/url" "strings" + sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/discovery" "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces" @@ -13,6 +16,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -22,6 +26,8 @@ import ( ) var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { + var node string + var nic *sriovv1.InterfaceExt BeforeAll(func() { if cluster.VirtualCluster() { @@ -48,13 +54,11 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { Expect(err).ToNot(HaveOccurred()) WaitForSRIOVStable() - }) - It("collects metrics regarding receiving traffic via VF", func() { sriovInfos, err := cluster.DiscoverSriov(clients, operatorNamespace) Expect(err).ToNot(HaveOccurred()) - node, nic, err := sriovInfos.FindOneSriovNodeAndDevice() + node, nic, err = sriovInfos.FindOneSriovNodeAndDevice() Expect(err).ToNot(HaveOccurred()) By("Using device " + nic.Name + " on node " + node) @@ -65,7 +69,13 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { Expect(err).ToNot(HaveOccurred()) waitForNetAttachDef("test-me-network", namespaces.Test) + DeferCleanup(namespaces.Clean, operatorNamespace, namespaces.Test, clients, discovery.Enabled()) + }) + + It("collects metrics regarding receiving traffic via VF", func() { + pod := createTestPod(node, []string{"test-me-network"}) + DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) ips, err := network.GetSriovNicIPs(pod, "net1") Expect(err).ToNot(HaveOccurred()) @@ -88,6 +98,28 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() { Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets)) }) + It("PrometheusRule should provide namespaced metrics", func() { + pod := createTestPod(node, []string{"test-me-network"}) + DeferCleanup(namespaces.CleanPods, namespaces.Test, clients) + + namespacedMetricNames := []string{ + "network:sriov_vf_rx_bytes", + "network:sriov_vf_tx_bytes", + "network:sriov_vf_rx_packets", + "network:sriov_vf_tx_packets", + "network:sriov_vf_rx_dropped", + "network:sriov_vf_tx_dropped", + "network:sriov_vf_rx_broadcast", + "network:sriov_vf_rx_multicast", + } + + Eventually(func(g Gomega) { + for _, metricName := range namespacedMetricNames { + values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name)) + g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName) + } + }, "40s", "1s").Should(Succeed()) + }) }) func getMetricsForNode(nodeName string) map[string]*dto.MetricFamily { @@ -185,3 +217,33 @@ func areLabelsMatching(labels []*dto.LabelPair, labelsToMatch map[string]string) return true } + +func runPromQLQuery(query string) model.Vector { + prometheusPods, err := clients.Pods("").List(context.Background(), metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/component=prometheus", + }) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + ExpectWithOffset(1, prometheusPods.Items).ToNot(HaveLen(0), "At least one Prometheus operator pod expected") + + prometheusPod := prometheusPods.Items[0] + + url := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode()) + command := []string{"curl", url} + stdout, stderr, err := pod.ExecCommand(clients, &prometheusPod, command...) + ExpectWithOffset(1, err).ToNot(HaveOccurred(), + "promQL query failed: [%s/%s] command: [%v]\nstdout: %s\nstderr: %s", prometheusPod.Namespace, prometheusPod.Name, command, stdout, stderr) + + result := struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result model.Vector `json:"result"` + } `json:"data"` + }{} + + json.Unmarshal([]byte(stdout), &result) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + ExpectWithOffset(1, result.Status).To(Equal("success"), "cURL for [%s] failed: %s", url, stdout) + + return result.Data.Result +} diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go index 3c3cc4e84..100b0fdd5 100644 --- a/test/conformance/tests/test_sriov_operator.go +++ b/test/conformance/tests/test_sriov_operator.go @@ -311,6 +311,14 @@ var _ = Describe("[sriov] operator", func() { g.Expect(err).ToNot(HaveOccurred()) }).Should(Succeed()) }) + + It("should remove ServiceMonitor when the feature is turned off", func() { + setFeatureFlag("metricsExporter", false) + Eventually(func(g Gomega) { + _, err := clients.ServiceMonitors(operatorNamespace).Get(context.Background(), "sriov-network-metrics-exporter", metav1.GetOptions{}) + g.Expect(k8serrors.IsNotFound(err)).To(BeTrue()) + }).Should(Succeed()) + }) }) }) diff --git a/test/util/crds/monitoring.coreos.com_prometheusrules.yaml b/test/util/crds/monitoring.coreos.com_prometheusrules.yaml new file mode 100644 index 000000000..6c16e8396 --- /dev/null +++ b/test/util/crds/monitoring.coreos.com_prometheusrules.yaml @@ -0,0 +1,142 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.15.0 + operator.prometheus.io/version: 0.75.1 + name: prometheusrules.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + categories: + - prometheus-operator + kind: PrometheusRule + listKind: PrometheusRuleList + plural: prometheusrules + shortNames: + - promrule + singular: prometheusrule + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: |- + The `PrometheusRule` custom resource definition (CRD) defines [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules to be evaluated by `Prometheus` or `ThanosRuler` objects. + + + `Prometheus` and `ThanosRuler` objects select `PrometheusRule` objects using label and namespace selectors. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Specification of desired alerting rule definitions for Prometheus. + properties: + groups: + description: Content of Prometheus rule file + items: + description: RuleGroup is a list of sequentially evaluated recording + and alerting rules. + properties: + interval: + description: Interval determines how often rules in the group + are evaluated. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + limit: + description: |- + Limit the number of alerts an alerting rule and series a recording + rule can produce. + Limit is supported starting with Prometheus >= 2.31 and Thanos Ruler >= 0.24. + type: integer + name: + description: Name of the rule group. + minLength: 1 + type: string + partial_response_strategy: + description: |- + PartialResponseStrategy is only used by ThanosRuler and will + be ignored by Prometheus instances. + More info: https://github.com/thanos-io/thanos/blob/main/docs/components/rule.md#partial-response + pattern: ^(?i)(abort|warn)?$ + type: string + rules: + description: List of alerting and recording rules. + items: + description: |- + Rule describes an alerting or recording rule + See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule + properties: + alert: + description: |- + Name of the alert. Must be a valid label value. + Only one of `record` and `alert` must be set. + type: string + annotations: + additionalProperties: + type: string + description: |- + Annotations to add to each alert. + Only valid for alerting rules. + type: object + expr: + anyOf: + - type: integer + - type: string + description: PromQL expression to evaluate. + x-kubernetes-int-or-string: true + for: + description: Alerts are considered firing once they have + been returned for this long. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + keep_firing_for: + description: KeepFiringFor defines how long an alert will + continue firing after the condition that triggered it + has cleared. + minLength: 1 + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + labels: + additionalProperties: + type: string + description: Labels to add or overwrite. + type: object + record: + description: |- + Name of the time series to output to. Must be a valid metric name. + Only one of `record` and `alert` must be set. + type: string + required: + - expr + type: object + type: array + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true