From 9b8d388a77209de7c05a685418a99b58b28222aa Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 24 Sep 2024 12:03:21 -0700 Subject: [PATCH 1/4] temp add daemonset k8s config --- .../templates/otel-collector-config.yaml | 484 ++++++++++++++++++ 1 file changed, 484 insertions(+) diff --git a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml index df5695b17..d0c49e7ab 100644 --- a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml +++ b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml @@ -11,7 +11,455 @@ data: protocols: grpc: endpoint: $RECEIVER_ENDPOINT:4317 + hostmetrics: + # TODO (chris): this is a linux specific configuration + {{- if include "newrelic.common.privileged" . }} + root_path: /hostfs + {{- end }} + collection_interval: {{ .Values.receivers.hostmetrics.scrapeInterval }} + scrapers: + cpu: + metrics: + system.cpu.time: + enabled: false + system.cpu.utilization: + enabled: true + load: + memory: + metrics: + system.memory.utilization: + enabled: true + paging: + metrics: + system.paging.utilization: + enabled: false + system.paging.faults: + enabled: false + filesystem: + metrics: + system.filesystem.utilization: + enabled: true + disk: + metrics: + system.disk.merged: + enabled: false + system.disk.pending_operations: + enabled: false + system.disk.weighted_io_time: + enabled: false + network: + metrics: + system.network.connections: + enabled: false + # Uncomment to enable process metrics, which can be noisy but valuable. + # processes: + # process: + # metrics: + # process.cpu.utilization: + # enabled: true + # process.cpu.time: + # enabled: false + # mute_process_name_error: true + # mute_process_exe_error: true + # mute_process_io_error: true + # mute_process_user_error: true + kubeletstats: + collection_interval: {{ .Values.receivers.kubeletstats.scrapeInterval }} + {{- if include "newrelic.common.privileged" . }} + endpoint: "${KUBE_NODE_NAME}:10250" + auth_type: "serviceAccount" + insecure_skip_verify: true + {{- else }} + endpoint: "${KUBE_NODE_NAME}:10255" + auth_type: "none" + {{- end }} + metrics: + k8s.container.cpu_limit_utilization: + enabled: true + + + prometheus: + config: + scrape_configs: + - job_name: cadvisor + scrape_interval: {{ .Values.receivers.prometheus.scrapeInterval }} + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - replacement: kubernetes.default.svc.cluster.local:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + - action: replace + target_label: job_label + replacement: cadvisor + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + server_name: kubernetes + - job_name: kubelet + scrape_interval: {{ .Values.receivers.prometheus.scrapeInterval }} + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - replacement: kubernetes.default.svc.cluster.local:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$${1}/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + - action: replace + target_label: job_label + replacement: kubelet + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + server_name: kubernetes + processors: + metricstransform/ldm: + transforms: + - include: .* + match_type: regexp + action: update + operations: + - action: add_label + new_label: low.data.mode + new_value: 'false' + + metricstransform/kubeletstats: + transforms: + - include: container\.(cpu\.utilization|filesystem\.(capacity|usage)|memory\.usage) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: k8s\.node\.(cpu\.(time|utilization)|filesystem\.(capacity|usage)|memory\.(available|working_set)) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: k8s\.pod\.(filesystem\.(available|capacity|usage)|memory\.(working_set)) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + + metricstransform/cadvisor: + transforms: + - include: container_cpu_(cfs_(periods_total|throttled_periods_total)|usage_seconds_total) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: container_memory_working_set_bytes + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: container_network_(working_set_bytes|receive_(bytes_total|errors_total)|transmit_(bytes_total|errors_total)) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: container_spec_memory_limit_bytes + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + + metricstransform/kubelet: + transforms: + - include: go_(goroutines|threads) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: process_resident_memory_bytes + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + + metricstransform/hostmetrics: + transforms: + - include: process\.(cpu\.utilization|disk\.io|memory\.(usage|virtual)) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: system\.cpu\.(utilization|load_average\.(15m|1m|5m)) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: system\.disk\.(io_time|operation_time|operations) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: system\.(filesystem|memory)\.(usage|utilization) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + - include: system\.network\.(errors|io|packets) + action: update + match_type: regexp + operations: + - action: update_label + label: low.data.mode + value_actions: + - value: 'false' + new_value: 'true' + + + filter/exclude_metrics_low_data_mode: + metrics: + metric: + - 'HasAttrOnDatapoint("low.data.mode", "false")' + + transform/truncate: + log_statements: + - context: log + statements: + - truncate_all(attributes, 4095) + - truncate_all(resource.attributes, 4095) + + # group system.cpu metrics by cpu + metricstransform/hostmetrics_cpu: + transforms: + - include: system.cpu.utilization + action: update + operations: + - action: aggregate_labels + label_set: [ state ] + aggregation_type: mean + - include: system.paging.operations + action: update + operations: + - action: aggregate_labels + label_set: [ direction ] + aggregation_type: sum + + # following system.% metrics reduce metrics reported by hostmetrics receiver + filter/exclude_cpu_utilization: + metrics: + datapoint: + - 'metric.name == "system.cpu.utilization" and attributes["state"] == "interrupt"' + - 'metric.name == "system.cpu.utilization" and attributes["state"] == "nice"' + - 'metric.name == "system.cpu.utilization" and attributes["state"] == "softirq"' + filter/exclude_memory_utilization: + metrics: + datapoint: + - 'metric.name == "system.memory.utilization" and attributes["state"] == "slab_unreclaimable"' + - 'metric.name == "system.memory.utilization" and attributes["state"] == "inactive"' + - 'metric.name == "system.memory.utilization" and attributes["state"] == "cached"' + - 'metric.name == "system.memory.utilization" and attributes["state"] == "buffered"' + - 'metric.name == "system.memory.utilization" and attributes["state"] == "slab_reclaimable"' + filter/exclude_memory_usage: + metrics: + datapoint: + - 'metric.name == "system.memory.usage" and attributes["state"] == "slab_unreclaimable"' + - 'metric.name == "system.memory.usage" and attributes["state"] == "inactive"' + filter/exclude_filesystem_utilization: + metrics: + datapoint: + - 'metric.name == "system.filesystem.utilization" and attributes["type"] == "squashfs"' + filter/exclude_filesystem_usage: + metrics: + datapoint: + - 'metric.name == "system.filesystem.usage" and attributes["type"] == "squashfs"' + - 'metric.name == "system.filesystem.usage" and attributes["state"] == "reserved"' + filter/exclude_filesystem_inodes_usage: + metrics: + datapoint: + - 'metric.name == "system.filesystem.inodes.usage" and attributes["type"] == "squashfs"' + - 'metric.name == "system.filesystem.inodes.usage" and attributes["state"] == "reserved"' + filter/exclude_system_disk: + metrics: + datapoint: + - 'metric.name == "system.disk.operations" and IsMatch(attributes["device"], "^loop.*") == true' + - 'metric.name == "system.disk.merged" and IsMatch(attributes["device"], "^loop.*") == true' + - 'metric.name == "system.disk.io" and IsMatch(attributes["device"], "^loop.*") == true' + - 'metric.name == "system.disk.io_time" and IsMatch(attributes["device"], "^loop.*") == true' + - 'metric.name == "system.disk.operation_time" and IsMatch(attributes["device"], "^loop.*") == true' + filter/exclude_system_paging: + metrics: + datapoint: + - 'metric.name == "system.paging.usage" and attributes["state"] == "cached"' + - 'metric.name == "system.paging.operations" and attributes["type"] == "cached"' + filter/exclude_network: + metrics: + datapoint: + - 'IsMatch(metric.name, "^system.network.*") == true and attributes["device"] == "lo"' + + attributes/exclude_system_paging: + include: + match_type: strict + metric_names: + - system.paging.operations + actions: + - key: type + action: delete + + resourcedetection/env: + detectors: ["env", "system"] + override: false + system: + hostname_sources: ["os"] + resource_attributes: + host.id: + enabled: true + + resourcedetection/cloudproviders: + detectors: [gcp, eks, azure, aks, ec2, ecs] + timeout: 2s + override: false + ec2: + resource_attributes: + host.name: + enabled: false + + resource: + attributes: + - key: host.id + from_attribute: host.name + action: upsert + # TODO (chris): Upsert only when cluster name not found (resource detection override: true) + - key: k8s.cluster.name + action: upsert + value: {{ include "newrelic.common.cluster" . }} + - key: newrelicOnly + action: upsert + value: 'true' + - key: service.name + action: delete + - key: service_name + action: delete + + transform/low_data_mode_inator: + metric_statements: + - context: metric + statements: + - set(description, "") + - set(unit, "") + + resource/low_data_mode_inator: + attributes: + - key: http.scheme + action: delete + - key: net.host.name + action: delete + - key: net.host.port + action: delete + + k8sattributes: + auth_type: "serviceAccount" + passthrough: false + filter: + node_from_env_var: KUBE_NODE_NAME + extract: + metadata: + - k8s.pod.name + - k8s.pod.uid + - k8s.deployment.name + - k8s.namespace.name + - k8s.node.name + - k8s.pod.start_time + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.uid + + attributes/self: + actions: + - key: k8s.pod.name + action: update + from_attribute: pod + - key: k8s.deployment.name + action: update + from_attribute: deployment + - key: k8s.node.name + action: update + from_attribute: node + - key: k8s.namespace.name + action: upsert + from_attribute: namespace + + memory_limiter: + check_interval: 1s + limit_percentage: 80 + spike_limit_percentage: 25 + + cumulativetodelta: + + batch: + send_batch_max_size: 1000 + timeout: 30s + send_batch_size : 800 + + k8sattributes/local_k8s_md: auth_type: 'serviceAccount' filter: # Perform the lookup for pod association on only the Node the OTel collector is running on. @@ -132,4 +580,40 @@ data: - resource/setup_for_export exporters: - otlp + metrics-k8s: + receivers: + - hostmetrics + - kubeletstats + - prometheus + processors: + - metricstransform/ldm + - metricstransform/kubeletstats + - metricstransform/cadvisor + - metricstransform/kubelet + - metricstransform/hostmetrics + - filter/exclude_metrics_low_data_mode + - metricstransform/hostmetrics_cpu + - transform/truncate + - filter/exclude_cpu_utilization + - filter/exclude_memory_utilization + - filter/exclude_memory_usage + - filter/exclude_filesystem_utilization + - filter/exclude_filesystem_usage + - filter/exclude_filesystem_inodes_usage + - filter/exclude_system_disk + - filter/exclude_system_paging + - filter/exclude_network + - attributes/exclude_system_paging + - resourcedetection/env + - resourcedetection/cloudproviders + - resource + - transform/low_data_mode_inator + - resource/low_data_mode_inator + - k8sattributes + - attributes/self + - memory_limiter + - cumulativetodelta + - batch + exporters: + - otlphttp/newrelic clusterName: {{ required "Please set the Kubernetes cluster name" .Values.cluster | quote }} From ca677f768bc66b4933bf2ccf6c512bf111f5c1e6 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 24 Sep 2024 12:23:58 -0700 Subject: [PATCH 2/4] remove irrelevant code --- .../templates/otel-collector-config.yaml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml index d0c49e7ab..60ad03657 100644 --- a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml +++ b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml @@ -12,11 +12,8 @@ data: grpc: endpoint: $RECEIVER_ENDPOINT:4317 hostmetrics: - # TODO (chris): this is a linux specific configuration - {{- if include "newrelic.common.privileged" . }} root_path: /hostfs - {{- end }} - collection_interval: {{ .Values.receivers.hostmetrics.scrapeInterval }} + collection_interval: 60s scrapers: cpu: metrics: @@ -64,15 +61,10 @@ data: # mute_process_io_error: true # mute_process_user_error: true kubeletstats: - collection_interval: {{ .Values.receivers.kubeletstats.scrapeInterval }} - {{- if include "newrelic.common.privileged" . }} + collection_interval: 60s endpoint: "${KUBE_NODE_NAME}:10250" auth_type: "serviceAccount" insecure_skip_verify: true - {{- else }} - endpoint: "${KUBE_NODE_NAME}:10255" - auth_type: "none" - {{- end }} metrics: k8s.container.cpu_limit_utilization: enabled: true @@ -389,7 +381,7 @@ data: # TODO (chris): Upsert only when cluster name not found (resource detection override: true) - key: k8s.cluster.name action: upsert - value: {{ include "newrelic.common.cluster" . }} + value: $CLUSTER_NAME - key: newrelicOnly action: upsert value: 'true' From 65932d00349005841f6056e9e31918d82fd695c8 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 24 Sep 2024 13:39:12 -0700 Subject: [PATCH 3/4] indentation --- .../templates/otel-collector-config.yaml | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml index 60ad03657..77a0fe1f9 100644 --- a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml +++ b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml @@ -574,38 +574,38 @@ data: - otlp metrics-k8s: receivers: - - hostmetrics - - kubeletstats - - prometheus - processors: - - metricstransform/ldm - - metricstransform/kubeletstats - - metricstransform/cadvisor - - metricstransform/kubelet - - metricstransform/hostmetrics - - filter/exclude_metrics_low_data_mode - - metricstransform/hostmetrics_cpu - - transform/truncate - - filter/exclude_cpu_utilization - - filter/exclude_memory_utilization - - filter/exclude_memory_usage - - filter/exclude_filesystem_utilization - - filter/exclude_filesystem_usage - - filter/exclude_filesystem_inodes_usage - - filter/exclude_system_disk - - filter/exclude_system_paging - - filter/exclude_network - - attributes/exclude_system_paging - - resourcedetection/env - - resourcedetection/cloudproviders - - resource - - transform/low_data_mode_inator - - resource/low_data_mode_inator - - k8sattributes - - attributes/self - - memory_limiter - - cumulativetodelta - - batch - exporters: - - otlphttp/newrelic + - hostmetrics + - kubeletstats + - prometheus + processors: + - metricstransform/ldm + - metricstransform/kubeletstats + - metricstransform/cadvisor + - metricstransform/kubelet + - metricstransform/hostmetrics + - filter/exclude_metrics_low_data_mode + - metricstransform/hostmetrics_cpu + - transform/truncate + - filter/exclude_cpu_utilization + - filter/exclude_memory_utilization + - filter/exclude_memory_usage + - filter/exclude_filesystem_utilization + - filter/exclude_filesystem_usage + - filter/exclude_filesystem_inodes_usage + - filter/exclude_system_disk + - filter/exclude_system_paging + - filter/exclude_network + - attributes/exclude_system_paging + - resourcedetection/env + - resourcedetection/cloudproviders + - resource + - transform/low_data_mode_inator + - resource/low_data_mode_inator + - k8sattributes + - attributes/self + - memory_limiter + - cumulativetodelta + - batch + exporters: + - otlphttp/newrelic clusterName: {{ required "Please set the Kubernetes cluster name" .Values.cluster | quote }} From 6a0d058ddddc26ff95cd033b49f1bdec5956d171 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 24 Sep 2024 14:12:52 -0700 Subject: [PATCH 4/4] host volume --- .../nr-ebpf-agent/templates/otel-collector-daemonset.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml b/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml index 97a656849..43ba6939e 100644 --- a/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml +++ b/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml @@ -60,6 +60,12 @@ spec: mountPath: /etc/otel/config.yaml subPath: config.yaml readOnly: true + - name: host-fs + mountPath: /hostfs + readOnly: true + - name: varlogpods + mountPath: /var/log/pods + readOnly: true dnsPolicy: ClusterFirstWithHostNet hostNetwork: true serviceAccountName: {{ include "nr-ebpf-agent.fullname" . }}-collector