-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcentral_monitor.yaml
500 lines (451 loc) · 17.3 KB
/
central_monitor.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
## Components of the stack that this release does not deploy.
## Every toggle below is switched off.

## Node exporter (deployed as a DaemonSet to all nodes when enabled)
nodeExporter:
  enabled: false

prometheus:
  enabled: false

prometheusOperator:
  enabled: false

alertmanager:
  enabled: false

## Flag to disable all the Kubernetes component scrapers
kubernetesServiceMonitors:
  enabled: false

## Component scraping kube-state-metrics
kubeStateMetrics:
  enabled: false
###################################
# Centralized Monitoring Host
###################################
## Grafana configuration for the central monitoring host: ingress, storage,
## provisioned dashboards, datasources pointing at remote clusters, and
## provisioned alerting (rules and contact points).
grafana:
  enabled: true
  namespaceOverride: ""
  ## Create the datasource ConfigMap even if the Grafana deployment is disabled
  forceDeployDatasources: true
  ## Create the dashboard ConfigMaps even if the Grafana deployment is disabled
  forceDeployDashboards: true
  ## Deploy default dashboards
  defaultDashboardsEnabled: true
  ## Timezone for the default dashboards
  defaultDashboardsTimezone: Asia/Seoul
  defaultDashboardsEditable: false
  # Placeholder, presumably substituted before Helm reads this file (e.g. by
  # envsubst) - TODO confirm. Quoted so the substituted value is always parsed
  # as a string, even if it looks numeric/boolean or contains " #".
  # A value containing a single quote would need that quote doubled.
  adminPassword: '${GRAFANA_ADMIN}'

  rbac:
    ## If true, Grafana PSPs will be created
    pspEnabled: false

  ingress:
    enabled: true
    # The ingress class is selected through the chart's ingressClassName value.
    # An annotation named "spec.ingressClassName" is plain metadata and does
    # not select a class.
    ingressClassName: traefik  # k3s default ingress controller
    annotations: {}
    labels: {}
    hosts:
      - monitor.wai
    path: /
    tls: []

  # The default PV of the grafana/grafana chart is meant for initial
  # provisioning and is deleted together with the release on `helm uninstall`.
  # Its policy is fixed to "persistentVolumeReclaimPolicy: Delete", so it cannot
  # be changed through values.
  # Setting persistence.type to statefulset instead keeps the data even after
  # `helm uninstall`. To remove the PV in that setup, delete the PVC that is
  # left in the namespace.
  # NOTE(review): `enabled` is false below, so the statefulset setting
  # presumably has no effect until persistence is switched on - confirm intent.
  persistence:
    # Type of persistence (pvc or statefulset)
    type: statefulset
    enabled: false
    # storageClassName: default
    accessModes:
      - ReadWriteOnce
    size: 10Gi
    # annotations: {}
    finalizers:
      - kubernetes.io/pvc-protection
    # selectorLabels: {}
    ## Sub-directory of the PV to mount. Can be templated.
    # subPath: ""
    ## Use an existing PVC to persist data (can be templated)
    # existingClaim:
    ## Extra labels to apply to a PVC.
    extraPvcLabels: {}
    ## If persistence is not enabled, this allows to mount the
    ## local storage in-memory to improve performance
    ##
    inMemory:
      enabled: false
      ## The maximum usage on memory medium EmptyDir would be
      ## the minimum value between the SizeLimit specified
      ## here and the sum of memory limits of all containers in a pod
      ##
      # sizeLimit: 300Mi

  ## Additional grafana server volume mounts
  # Defines additional volume mounts.
  # extraVolumeMounts: []
  #   - name: extra-volume-0
  #     mountPath: /mnt/volume0
  #     readOnly: true
  #   - name: extra-volume-1
  #     mountPath: /mnt/volume1
  #     readOnly: true
  #   - name: grafana-secrets
  #     mountPath: /mnt/volume2

  ## Additional Grafana server volumes
  # extraVolumes: []
  #   - name: extra-volume-0
  #     existingClaim: volume-claim
  #   - name: extra-volume-1
  #     hostPath:
  #       path: /usr/shared/
  #       type: ""
  #   - name: grafana-secrets
  #     csi:
  #       driver: secrets-store.csi.k8s.io
  #       readOnly: true
  #       volumeAttributes:
  #         secretProviderClass: "grafana-env-spc"

  ## Configure grafana dashboard providers
  ## ref: http://docs.grafana.org/administration/provisioning/#dashboards
  ##
  ## `path` must be /var/lib/grafana/dashboards/<provider_name>
  ##
  dashboardProviders:  # {}
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'my-default'
          orgId: 1
          folder: 'multi-cluster-node-exporter'
          type: file
          disableDeletion: true
          editable: false
          options:
            # Dashboard JSON files are stored under this path. The Grafana UI
            # treats them as provisioned, so they cannot be edited or deleted
            # there.
            # With a PVC, a dashboard removed from this manifest still lingers;
            # manually deleting its JSON file under this path makes it
            # deletable from the UI as well.
            path: /var/lib/grafana/dashboards/my-default
        # - name: 'default'
        #   orgId: 1
        #   folder: ''
        #   type: file
        #   disableDeletion: false
        #   editable: true
        #   options:
        #     path: /var/lib/grafana/dashboards/default

  ## Configure grafana dashboard to import
  ## NOTE: To use dashboards you must also enable/configure dashboardProviders
  ## ref: https://grafana.com/dashboards
  ##
  ## dashboards per provider, use provider name as key.
  ##
  dashboards:  # {}
    my-default:
      # Based on 22413: modified to reference multiple datasources, fixed the
      # per-node page queries, adjusted the resolution of some panels.
      11074_9_prom_multi_k8s_grafana11:
        url: https://raw.githubusercontent.com/YunanJeong/cmm/refs/heads/main/dashboards/11074_9_prom_multi_k8s_grafana11.json
        # datasource: '$datasource'
      nodes:
        url: https://raw.githubusercontent.com/YunanJeong/cmm/refs/heads/main/dashboards/nodes.json
      node-cluster-rsrc-use:
        url: https://raw.githubusercontent.com/YunanJeong/cmm/refs/heads/main/dashboards/node-cluster-rsrc-use.json
      node-exporter-full:
        gnetId: 1860
        revision: 32
    # default:
    #   some-dashboard:
    #     json: |
    #       $RAW_JSON
    #   custom-dashboard:
    #     file: dashboards/custom-dashboard.json
    #   prometheus-stats:
    #     gnetId: 2
    #     revision: 2
    #     datasource: Prometheus
    #   local-dashboard:
    #     url: https://example.com/repository/test.json
    #     token: ''
    #   local-dashboard-base64:
    #     url: https://example.com/repository/test-b64.json
    #     token: ''
    #     b64content: true
    #   local-dashboard-gitlab:
    #     # For the GitLab raw path format see:
    #     # https://docs.gitlab.com/ee/api/repository_files.html#get-raw-file-from-repository
    #     url: https://example.com/repository/test-gitlab.json
    #     gitlabToken: ''  # access token granted the read-repository scope
    #   local-dashboard-bitbucket:
    #     url: https://example.com/repository/test-bitbucket.json
    #     bearerToken: ''
    #   local-dashboard-azure:
    #     url: https://example.com/repository/test-azure.json
    #     basic: ''
    #     acceptHeader: '*/*'

  ## Reference to external ConfigMap per provider. Use provider name as key and ConfigMap name as value.
  ## A provider dashboards must be defined either by external ConfigMaps or in values.yaml, not in both.
  ## ConfigMap data example:
  ##
  ## data:
  ##   example-dashboard.json: |
  ##     RAW_JSON
  ##
  dashboardsConfigMaps: {}
  #  default: ""

  sidecar:
    # Pre-provision dashboards (grafana-sc-dashboard sidecar)
    dashboards:
      # When true, enables the dashboards bundled with kube-prometheus-stack
      # and custom dashboards.
      enabled: true
      # Custom dashboards are pre-provisioned through a separate ConfigMap:
      #   - under metadata.labels, set the same key/value as label/labelValue
      #   - under data, put the dashboard JSON
      # Deploying that ConfigMap then deploys the dashboard (on error, restart
      # only the sidecar). The ConfigMap can be supplied through the
      # extraManifests section at the bottom of this values file or through a
      # separate Helm template.
      label: grafana_dashboard
      labelValue: "1"
      # Allow discovery in all namespaces for dashboards
      searchNamespace: ALL
      ## Annotations for Grafana dashboard configmaps
      ##
      annotations: {}
      multicluster:
        global:
          enabled: false  # default: false (original note marked this as uncertain)
        etcd:
          enabled: false  # default: false (original note marked this as uncertain)
      provider:
        allowUiUpdates: true

    # Pre-provision datasources (grafana)
    datasources:
      enabled: true
      # Creates the default data source, which points at the Prometheus created
      # by the same Helm release in the same cluster as Grafana.
      defaultDatasourceEnabled: false
      isDefaultDatasource: false
      uid: prometheus
      ## URL of prometheus datasource
      ##
      # url: http://prometheus-stack-prometheus:9090/
      ## Prometheus request timeout in seconds
      # timeout: 30
      # If not defined, will use prometheus.prometheusSpec.scrapeInterval or its default
      # defaultDatasourceScrapeInterval: 15s
      ## Annotations for Grafana datasource configmaps
      ##
      annotations: {}
      ## Set method for HTTP to send query to datasource
      httpMethod: POST
      ## Create datasource for each Pod of Prometheus StatefulSet;
      ## this uses headless service `prometheus-operated` which is
      ## created by Prometheus Operator
      ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/0fee93e12dc7c2ea1218f19ae25ec6b893460590/pkg/prometheus/statefulset.go#L255-L286
      createPrometheusReplicasDatasources: false
      label: grafana_datasource
      labelValue: "1"
      ## Field with internal link pointing to existing data source in Grafana.
      ## Can be provisioned via additionalDataSources
      exemplarTraceIdDestinations: {}
        # datasourceUid: Jaeger
        # traceIdLabelName: trace_id
      alertmanager:
        enabled: true
        uid: alertmanager
        handleGrafanaManagedAlerts: false
        implementation: prometheus

  extraConfigmapMounts: []
  # - name: certs-configmap
  #   mountPath: /etc/grafana/ssl/
  #   configMap: certs-configmap
  #   readOnly: true

  deleteDatasources: []
  # - name: example-datasource
  #   orgId: 1

  ## Configure additional grafana datasources (passed through tpl)
  ## ref: http://docs.grafana.org/administration/provisioning/#datasources
  # Requires grafana.sidecar.datasources.enabled=true
  # For every `url` below: state the scheme (http/https) explicitly, and take
  # care not to expose a public IP.
  additionalDataSources:  # []
    # EKS
    - name: eks-prometheus
      access: proxy
      basicAuth: false
      editable: false
      jsonData:
        tlsSkipVerify: true
      orgId: 1
      type: prometheus
      url: http://${MY_EKS_DNS}:9090
      # With a PVC in use, some parameter changes may not be applied on
      # upgrade; bumping `version` makes them take effect.
      version: 1
      isDefault: true
    - name: eks-loki
      access: proxy
      basicAuth: false
      editable: false
      jsonData:
        tlsSkipVerify: true
        # Number of Loki query lines allowed by Grafana. Going above 5000 also
        # requires lifting the query limit in Loki itself.
        maxLines: 5000
      orgId: 1
      type: loki
      url: http://${MY_EKS_LOKI}:3100
    - name: kr-rag-prometheus
      access: proxy
      basicAuth: false
      editable: false
      jsonData:
        tlsSkipVerify: true
      orgId: 1
      type: prometheus
      url: http://172.31.58.245:9090
      version: 1
    - name: kr-mum-prometheus
      access: proxy
      basicAuth: false
      editable: false
      jsonData:
        tlsSkipVerify: true
      orgId: 1
      type: prometheus
      url: http://172.31.4.123:9090
      version: 1
    # - name: prometheus-sample
    #   access: proxy
    #   basicAuth: true
    #   basicAuthPassword: pass
    #   basicAuthUser: daco
    #   editable: false
    #   jsonData:
    #     tlsSkipVerify: true
    #   orgId: 1
    #   type: prometheus
    #   url: https://{{ printf "%s-prometheus.svc" .Release.Name }}:9090
    #   version: 1

  ## Passed to grafana subchart and used by servicemonitor below
  ##
  service:
    type: NodePort  # LoadBalancer
    nodePort: 30000
    portName: http-web

  serviceMonitor:
    enabled: false

  ## Configure grafana alerting (can be templated)
  ## ref: http://docs.grafana.org/administration/provisioning/#alerting
  ##
  alerting:  # {}
    delete_rules.yaml:
      # Existing rules can be deleted by listing their uid here; the uid can be
      # found in the rule JSON.
      # Provisioned rules in particular can only be deleted this way.
      # Because deletion is this cumbersome, prefer creating new rules in the
      # Web UI rather than through provisioning.
      deleteRules:
        - orgId: 1
          uid: aa1639d3-00c7-4cde-b12f-xxxxxxxxxxxx
        - orgId: 1
          uid: f8b222fe-870f-4342-abee-xxxxxxxxxxxx

    rules.yaml:
      # Alert rule provisioning example.
      # Detects, through a specific Prometheus, nodes whose memory or storage
      # usage exceeds a threshold.
      # Alert rules are much easier to author in the UI and can be exported as
      # JSON there, but they cannot be imported through the UI.
      # To register a rule from JSON, either provision it in this rules.yaml
      # form or use the Grafana server REST API.
      # NOTE(review): neither query below carries a datasourceUid. Grafana's
      # rule provisioning format normally has one, so confirm the rules resolve
      # their datasource before unpausing them.
      apiVersion: 1
      groups:
        - orgId: 1
          name: node_check_every_1m
          folder: my-sample-alerts
          interval: 1m
          rules:
            - uid: my-memory-uid
              title: my-memory
              condition: A
              data:
                - refId: A
                  relativeTimeRange:
                    from: 600
                    to: 0
                  model:
                    datasource:
                      type: prometheus
                    disableTextWrap: false
                    editorMode: code
                    exemplar: false
                    # Prometheus query selecting nodes above 85% memory usage
                    expr: 100 * (1 - ((node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) / node_memory_MemTotal_bytes{job="node-exporter"})) > 85
                    format: time_series
                    fullMetaSearch: false
                    includeNullMetadata: true
                    instant: true
                    intervalMs: 1000
                    legendFormat: __auto
                    maxDataPoints: 43200
                    range: false
                    refId: A
                    useBackend: false
              noDataState: NoData
              execErrState: Error
              for: 5m
              annotations:
                description: my-memory-is-dangerous
              labels:
                memory: warning-sample
              isPaused: true  # rule is paused
            - uid: my-storage-uid
              title: my-storage
              condition: A
              data:
                - refId: A
                  relativeTimeRange:
                    from: 600
                    to: 0
                  model:
                    datasource:
                      type: prometheus
                    disableTextWrap: false
                    editorMode: code
                    exemplar: false
                    # Prometheus query selecting nodes above 85% storage usage
                    expr: 100 * (1 - (node_filesystem_free_bytes{job="node-exporter",device="/dev/root"} / node_filesystem_size_bytes{job="node-exporter",device="/dev/root"})) > 85
                    format: time_series
                    fullMetaSearch: false
                    includeNullMetadata: true
                    instant: true
                    intervalMs: 1000
                    legendFormat: __auto
                    maxDataPoints: 43200
                    range: false
                    refId: A
                    useBackend: false
              noDataState: NoData
              execErrState: Error
              for: 5m
              annotations:
                description: my-storage-is-dangerous
              labels:
                storage: warning-sample
              isPaused: true  # rule is paused

    contactpoints.yaml:
      secret:
        apiVersion: 1
        contactPoints:
          - orgId: 1
            name: my-teams-channel
            receivers:
              - uid: teams-channel
                type: teams
                settings:
                  # Placeholder substituted from outside; quoted so the
                  # substituted value always parses as a string.
                  url: '${TEAMS_WEBHOOK}'
                disableResolveMessage: false
          - orgId: 1
            name: my-line-channel
            receivers:
              - uid: line-channel
                type: LINE
                settings:
                  # Placeholder substituted from outside; quoted so the
                  # substituted value always parses as a string.
                  token: '${LINE_TOKEN}'
                disableResolveMessage: false

  ## Configure notifiers
  ## ref: http://docs.grafana.org/administration/provisioning/#alert-notification-channels
  ##
  notifiers: {}
  #  notifiers.yaml:
  #    notifiers:
  #    - name: email-notifier
  #      type: email
  #      uid: email1
  #      # either:
  #      org_id: 1
  #      # or
  #      org_name: Main Org.
  #      is_default: true
  #      settings:
  #        addresses: an_email_address@example.com
  #    delete_notifiers:
##
## Extra manifests to deploy as an array
## Currently empty. The commented example below is a ConfigMap labelled
## grafana_dashboard: "1" that carries a dashboard JSON file.
extraManifests: []
# - apiVersion: v1
#   kind: ConfigMap
#   metadata:
#     name: dashboard-configmap-test
#     namespace: "{{ .Release.Namespace }}"
#     labels:
#       grafana_dashboard: "1"
#   # tpl is applied to this section, so pasting dashboard JSON inline is very
#   # likely to cause parse errors because of its brackets.
#   data:
#     # File references are resolved relative to the chart's own directory.
#     nodes.json: '{{ .Files.Get "dashboards/nodes_line.json" | toJson }}'