diff --git a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml
index df5695b17..77a0fe1f9 100644
--- a/charts/nr-ebpf-agent/templates/otel-collector-config.yaml
+++ b/charts/nr-ebpf-agent/templates/otel-collector-config.yaml
@@ -11,7 +11,452 @@ data:
         protocols:
           grpc:
             endpoint: $RECEIVER_ENDPOINT:4317
+      # Host-level system metrics; root_path matches the /hostfs mount added to the collector DaemonSet.
+      hostmetrics:
+        root_path: /hostfs
+        collection_interval: 60s
+        scrapers:
+          cpu:
+            metrics:
+              system.cpu.time:
+                enabled: false
+              system.cpu.utilization:
+                enabled: true
+          load:
+          memory:
+            metrics:
+              system.memory.utilization:
+                enabled: true
+          paging:
+            metrics:
+              system.paging.utilization:
+                enabled: false
+              system.paging.faults:
+                enabled: false
+          filesystem:
+            metrics:
+              system.filesystem.utilization:
+                enabled: true
+          disk:
+            metrics:
+              system.disk.merged:
+                enabled: false
+              system.disk.pending_operations:
+                enabled: false
+              system.disk.weighted_io_time:
+                enabled: false
+          network:
+            metrics:
+              system.network.connections:
+                enabled: false
+          # Uncomment to enable process metrics, which can be noisy but valuable.
+          # processes:
+          # process:
+          #   metrics:
+          #     process.cpu.utilization:
+          #       enabled: true
+          #     process.cpu.time:
+          #       enabled: false
+          #   mute_process_name_error: true
+          #   mute_process_exe_error: true
+          #   mute_process_io_error: true
+          #   mute_process_user_error: true
+      kubeletstats:
+        collection_interval: 60s
+        endpoint: "${KUBE_NODE_NAME}:10250"
+        auth_type: "serviceAccount"
+        # NOTE(review): TLS verification of the kubelet endpoint is disabled; confirm this is acceptable.
+        insecure_skip_verify: true
+        metrics:
+          k8s.container.cpu_limit_utilization:
+            enabled: true
+
+
+      # Both jobs scrape through the API server node proxy (address and metrics path are rewritten below).
+      prometheus:
+        config:
+          scrape_configs:
+            - job_name: cadvisor
+              scrape_interval: {{ .Values.receivers.prometheus.scrapeInterval }}
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - replacement: kubernetes.default.svc.cluster.local:443
+                  target_label: __address__
+                - regex: (.+)
+                  replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
+                  source_labels:
+                    - __meta_kubernetes_node_name
+                  target_label: __metrics_path__
+                - action: replace
+                  target_label: job_label
+                  replacement: cadvisor
+              scheme: https
+              tls_config:
+                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+                insecure_skip_verify: false
+                server_name: kubernetes
+            - job_name: kubelet
+              scrape_interval: {{ .Values.receivers.prometheus.scrapeInterval }}
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - replacement: kubernetes.default.svc.cluster.local:443
+                  target_label: __address__
+                - regex: (.+)
+                  replacement: /api/v1/nodes/$${1}/proxy/metrics
+                  source_labels:
+                    - __meta_kubernetes_node_name
+                  target_label: __metrics_path__
+                - action: replace
+                  target_label: job_label
+                  replacement: kubelet
+              scheme: https
+              tls_config:
+                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+                insecure_skip_verify: false
+                server_name: kubernetes
+    processors:
+      # Label every metric low.data.mode=false; the per-source transforms below flip allow-listed metrics to true.
+      metricstransform/ldm:
+        transforms:
+          - include: .*
+            match_type: regexp
+            action: update
+            operations:
+              - action: add_label
+                new_label: low.data.mode
+                new_value: 'false'
+
+      metricstransform/kubeletstats:
+        transforms:
+          - include: container\.(cpu\.utilization|filesystem\.(capacity|usage)|memory\.usage)
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: k8s\.node\.(cpu\.(time|utilization)|filesystem\.(capacity|usage)|memory\.(available|working_set))
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: k8s\.pod\.(filesystem\.(available|capacity|usage)|memory\.(working_set))
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+
+      metricstransform/cadvisor:
+        transforms:
+          - include: container_cpu_(cfs_(periods_total|throttled_periods_total)|usage_seconds_total)
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: container_memory_working_set_bytes
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: container_network_(working_set_bytes|receive_(bytes_total|errors_total)|transmit_(bytes_total|errors_total))
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: container_spec_memory_limit_bytes
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+
+      metricstransform/kubelet:
+        transforms:
+          - include: go_(goroutines|threads)
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: process_resident_memory_bytes
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+
+      metricstransform/hostmetrics:
+        transforms:
+          - include: process\.(cpu\.utilization|disk\.io|memory\.(usage|virtual))
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: system\.cpu\.(utilization|load_average\.(15m|1m|5m))
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: system\.disk\.(io_time|operation_time|operations)
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: system\.(filesystem|memory)\.(usage|utilization)
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+          - include: system\.network\.(errors|io|packets)
+            action: update
+            match_type: regexp
+            operations:
+              - action: update_label
+                label: low.data.mode
+                value_actions:
+                  - value: 'false'
+                    new_value: 'true'
+
+
+      # Matches every metric whose datapoints still carry low.data.mode=false (i.e. not allow-listed above).
+      filter/exclude_metrics_low_data_mode:
+        metrics:
+          metric:
+            - 'HasAttrOnDatapoint("low.data.mode", "false")'
+
+      transform/truncate:
+        log_statements:
+          - context: log
+            statements:
+              - truncate_all(attributes, 4095)
+              - truncate_all(resource.attributes, 4095)
+
+      # group system.cpu metrics by cpu
+      metricstransform/hostmetrics_cpu:
+        transforms:
+          - include: system.cpu.utilization
+            action: update
+            operations:
+              - action: aggregate_labels
+                label_set: [ state ]
+                aggregation_type: mean
+          - include: system.paging.operations
+            action: update
+            operations:
+              - action: aggregate_labels
+                label_set: [ direction ]
+                aggregation_type: sum
+
+      # following system.% metrics reduce metrics reported by hostmetrics receiver
+      filter/exclude_cpu_utilization:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.cpu.utilization" and attributes["state"] == "interrupt"'
+            - 'metric.name == "system.cpu.utilization" and attributes["state"] == "nice"'
+            - 'metric.name == "system.cpu.utilization" and attributes["state"] == "softirq"'
+      filter/exclude_memory_utilization:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.memory.utilization" and attributes["state"] == "slab_unreclaimable"'
+            - 'metric.name == "system.memory.utilization" and attributes["state"] == "inactive"'
+            - 'metric.name == "system.memory.utilization" and attributes["state"] == "cached"'
+            - 'metric.name == "system.memory.utilization" and attributes["state"] == "buffered"'
+            - 'metric.name == "system.memory.utilization" and attributes["state"] == "slab_reclaimable"'
+      filter/exclude_memory_usage:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.memory.usage" and attributes["state"] == "slab_unreclaimable"'
+            - 'metric.name == "system.memory.usage" and attributes["state"] == "inactive"'
+      filter/exclude_filesystem_utilization:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.filesystem.utilization" and attributes["type"] == "squashfs"'
+      filter/exclude_filesystem_usage:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.filesystem.usage" and attributes["type"] == "squashfs"'
+            - 'metric.name == "system.filesystem.usage" and attributes["state"] == "reserved"'
+      filter/exclude_filesystem_inodes_usage:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.filesystem.inodes.usage" and attributes["type"] == "squashfs"'
+            - 'metric.name == "system.filesystem.inodes.usage" and attributes["state"] == "reserved"'
+      filter/exclude_system_disk:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.disk.operations" and IsMatch(attributes["device"], "^loop.*") == true'
+            - 'metric.name == "system.disk.merged" and IsMatch(attributes["device"], "^loop.*") == true'
+            - 'metric.name == "system.disk.io" and IsMatch(attributes["device"], "^loop.*") == true'
+            - 'metric.name == "system.disk.io_time" and IsMatch(attributes["device"], "^loop.*") == true'
+            - 'metric.name == "system.disk.operation_time" and IsMatch(attributes["device"], "^loop.*") == true'
+      filter/exclude_system_paging:
+        metrics:
+          datapoint:
+            - 'metric.name == "system.paging.usage" and attributes["state"] == "cached"'
+            - 'metric.name == "system.paging.operations" and attributes["type"] == "cached"'
+      filter/exclude_network:
+        metrics:
+          datapoint:
+            - 'IsMatch(metric.name, "^system.network.*") == true and attributes["device"] == "lo"'
+
+      attributes/exclude_system_paging:
+        include:
+          match_type: strict
+          metric_names:
+            - system.paging.operations
+        actions:
+          - key: type
+            action: delete
+
+      resourcedetection/env:
+        detectors: ["env", "system"]
+        override: false
+        system:
+          hostname_sources: ["os"]
+          resource_attributes:
+            host.id:
+              enabled: true
+
+      resourcedetection/cloudproviders:
+        detectors: [gcp, eks, azure, aks, ec2, ecs]
+        timeout: 2s
+        override: false
+        ec2:
+          resource_attributes:
+            host.name:
+              enabled: false
+
+      resource:
+        attributes:
+          - key: host.id
+            from_attribute: host.name
+            action: upsert
+          # TODO (chris): Upsert only when cluster name not found (resource detection override: true)
+          - key: k8s.cluster.name
+            action: upsert
+            value: $CLUSTER_NAME
+          - key: newrelicOnly
+            action: upsert
+            value: 'true'
+          - key: service.name
+            action: delete
+          - key: service_name
+            action: delete
+
+      transform/low_data_mode_inator:
+        metric_statements:
+          - context: metric
+            statements:
+              - set(description, "")
+              - set(unit, "")
+
+      resource/low_data_mode_inator:
+        attributes:
+          - key: http.scheme
+            action: delete
+          - key: net.host.name
+            action: delete
+          - key: net.host.port
+            action: delete
+
+      k8sattributes:
+        auth_type: "serviceAccount"
+        passthrough: false
+        filter:
+          node_from_env_var: KUBE_NODE_NAME
+        extract:
+          metadata:
+            - k8s.pod.name
+            - k8s.pod.uid
+            - k8s.deployment.name
+            - k8s.namespace.name
+            - k8s.node.name
+            - k8s.pod.start_time
+        pod_association:
+          - sources:
+              - from: resource_attribute
+                name: k8s.pod.uid
+
+      attributes/self:
+        actions:
+          - key: k8s.pod.name
+            action: update
+            from_attribute: pod
+          - key: k8s.deployment.name
+            action: update
+            from_attribute: deployment
+          - key: k8s.node.name
+            action: update
+            from_attribute: node
+          - key: k8s.namespace.name
+            action: upsert
+            from_attribute: namespace
+
+      memory_limiter:
+        check_interval: 1s
+        limit_percentage: 80
+        spike_limit_percentage: 25
+
+      cumulativetodelta:
+
+      batch:
+        send_batch_max_size: 1000
+        timeout: 30s
+        send_batch_size: 800
+
+
       k8sattributes/local_k8s_md:
         auth_type: 'serviceAccount'
         filter:
           # Perform the lookup for pod association on only the Node the OTel collector is running on.
@@ -132,4 +577,42 @@ data:
             - resource/setup_for_export
           exporters:
             - otlp
+        metrics-k8s:
+          receivers:
+            - hostmetrics
+            - kubeletstats
+            - prometheus
+          processors:
+            # memory_limiter goes first so data can be refused before any other processor spends memory on it.
+            - memory_limiter
+            - metricstransform/ldm
+            - metricstransform/kubeletstats
+            - metricstransform/cadvisor
+            - metricstransform/kubelet
+            - metricstransform/hostmetrics
+            - filter/exclude_metrics_low_data_mode
+            - metricstransform/hostmetrics_cpu
+            - transform/truncate
+            - filter/exclude_cpu_utilization
+            - filter/exclude_memory_utilization
+            - filter/exclude_memory_usage
+            - filter/exclude_filesystem_utilization
+            - filter/exclude_filesystem_usage
+            - filter/exclude_filesystem_inodes_usage
+            - filter/exclude_system_disk
+            - filter/exclude_system_paging
+            - filter/exclude_network
+            - attributes/exclude_system_paging
+            - resourcedetection/env
+            - resourcedetection/cloudproviders
+            - resource
+            - transform/low_data_mode_inator
+            - resource/low_data_mode_inator
+            - k8sattributes
+            - attributes/self
+            - cumulativetodelta
+            - batch
+          exporters:
+            # NOTE(review): no otlphttp/newrelic exporter is added in the visible hunks; confirm it is defined under exporters.
+            - otlphttp/newrelic
   clusterName: {{ required "Please set the Kubernetes cluster name" .Values.cluster | quote }}
diff --git a/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml b/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml
index 97a656849..43ba6939e 100644
--- a/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml
+++ b/charts/nr-ebpf-agent/templates/otel-collector-daemonset.yaml
@@ -60,6 +60,13 @@ spec:
               mountPath: /etc/otel/config.yaml
               subPath: config.yaml
               readOnly: true
+            # NOTE(review): only volumeMounts are added in the visible hunk; confirm host-fs and varlogpods volumes exist.
+            - name: host-fs
+              mountPath: /hostfs
+              readOnly: true
+            - name: varlogpods
+              mountPath: /var/log/pods
+              readOnly: true
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
       serviceAccountName: {{ include "nr-ebpf-agent.fullname" . }}-collector