Skip to content

Commit

Permalink
Add liveness and readiness probes to koperator's manager pod (#1050)
Browse files Browse the repository at this point in the history
* feat: add liveness/readiness probes

* add extra dash

* remove unnecessary quotes

* move the healthz import into the third-party import group

* relocate the liveness and readiness probe definitions in manager.yaml file

* rename health-probes to liveness-readiness-probe

* add missing equal sign

* relocate wrongly placed error block

* rename healthProbe to livenessReadinessProbe in helm charts

* rename liveness-readiness-probe to health-probes because of k8s's name length limitations

* rename liveness-readiness-probe to health-probes because of k8s's name length limitations

* revert rename of health-probe

* remove redundant default value for health probes port definition

* remove redundant default value for health probes port definition

* remove redundant default value for health probes port definition

* remove redundant default value for health probes port definition

* add failure threshold because of e2e tests

* increase the failure threshold

---------

Co-authored-by: Marton Barta <51166675+bartam1@users.noreply.github.com>
  • Loading branch information
matewolf and bartam1 authored Aug 31, 2023
1 parent 2feac63 commit a6f7678
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,26 @@ spec:
{{- if (.Values.metricEndpoint).port }}
- --metrics-addr=":{{ .Values.metricEndpoint.port }}"
{{- end }}
{{- if .Values.healthProbes.port }}
- --health-probes-addr=:{{ .Values.healthProbes.port }}
{{- end }}
image: "{{ .Values.operator.image.repository }}:{{ .Values.operator.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.operator.image.pullPolicy }}
name: manager
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 100
httpGet:
port: health-probes
path: /healthz
readinessProbe:
initialDelaySeconds: 20
periodSeconds: 15
failureThreshold: 100
httpGet:
port: health-probes
path: /readyz
env:
- name: POD_NAMESPACE
valueFrom:
Expand All @@ -212,25 +229,28 @@ spec:
{{ toYaml .Values.additionalEnv | nindent 12 }}
{{- end }}
ports:
{{- if .Values.webhook.enabled }}
{{- if .Values.webhook.enabled }}
- containerPort: {{ .Values.webhook.serverPort | default 9443 }}
name: webhook-server
protocol: TCP
{{- end }}
{{- end}}
- containerPort: {{ (.Values.metricEndpoint).port | default 8080 }}
name: metrics
protocol: TCP
- containerPort: {{ .Values.alertManager.port }}
name: alerts
protocol: TCP
- containerPort: {{ .Values.healthProbes.port | default 8081 }}
name: health-probes
protocol: TCP
volumeMounts:
{{- if .Values.webhook.enabled }}
- mountPath: {{ (.Values.webhook.tls).certDir | default "/etc/webhook/certs" }}
name: serving-cert
readOnly: true
{{- end }}
resources:
{{ toYaml .Values.operator.resources | nindent 12 }}
{{- toYaml .Values.operator.resources | nindent 12 }}
{{- if .Values.containerSecurityContext }}
securityContext:
{{ toYaml .Values.containerSecurityContext | nindent 12 }}
Expand Down
7 changes: 5 additions & 2 deletions charts/kafka-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ operator:
# configurable Kubernetes namespaces.
# In this scenario, users can replace the default
# ClusterRole and ClusterRoleBinding to Role and RoleBinding respectively.
# When this field is not empty and Cert-manager is used,
# When this field is not empty and Cert-manager is used,
# the Cert-manager's Custom Resource Namespace must be included in the comma separated list.
# When it is empty, all namespaces will be watched.
# When it is empty, all namespaces will be watched.
namespaces: ""
verboseLogging: false
developmentLogging: false
Expand Down Expand Up @@ -73,6 +73,9 @@ prometheusMetrics:
create: true
name: kafka-operator-authproxy

healthProbes: {}
# port:

#metricEndpoint:
# port:

Expand Down
15 changes: 15 additions & 0 deletions config/base/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,25 @@ spec:
- --enable-leader-election
image: ghcr.io/banzaicloud/kafka-operator:latest
name: manager
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 10
httpGet:
port: health-probes
path: /healthz
readinessProbe:
initialDelaySeconds: 20
periodSeconds: 15
httpGet:
port: health-probes
path: /readyz
ports:
- containerPort: 9001
name: alerts
protocol: TCP
- containerPort: 8081
name: health-probes
protocol: TCP
resources:
limits:
cpu: 300m
Expand Down
29 changes: 21 additions & 8 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"strings"

"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/healthz"

istioclientv1beta1 "github.com/banzaicloud/istio-client-go/pkg/networking/v1beta1"

Expand Down Expand Up @@ -89,6 +90,7 @@ func main() {
certSigningDisabled bool
certManagerEnabled bool
maxKafkaTopicConcurrentReconciles int
healthProbesAddr string
)

flag.StringVar(&namespaces, "namespaces", "", "Comma separated list of namespaces where operator listens for resources")
Expand All @@ -103,6 +105,7 @@ func main() {
flag.BoolVar(&certManagerEnabled, "cert-manager-enabled", false, "Enable cert-manager integration")
flag.BoolVar(&certSigningDisabled, "disable-cert-signing-support", false, "Disable native certificate signing integration")
flag.IntVar(&maxKafkaTopicConcurrentReconciles, "max-kafka-topic-concurrent-reconciles", 10, "Define max amount of concurrent KafkaTopic reconciles")
flag.StringVar(&healthProbesAddr, "health-probes-addr", ":8081", "The address the probe endpoint binds to.")
flag.Parse()
ctrl.SetLogger(util.CreateLogger(verboseLogging, developmentLogging))

Expand All @@ -125,20 +128,30 @@ func main() {
}

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "controller-leader-election-helper",
NewCache: managerWatchCacheBuilder,
Port: webhookServerPort,
CertDir: webhookCertDir,
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "controller-leader-election-helper",
NewCache: managerWatchCacheBuilder,
Port: webhookServerPort,
CertDir: webhookCertDir,
HealthProbeBindAddress: healthProbesAddr,
})

if err != nil {
setupLog.Error(err, "unable to start manager")
os.Exit(1)
}

if err = mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to start /healthz endpoint")
os.Exit(1)
}

if err = mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to start /readyz endpoint")
os.Exit(1)
}

if err := certv1.AddToScheme(mgr.GetScheme()); err != nil {
setupLog.Error(err, "")
os.Exit(1)
Expand Down

0 comments on commit a6f7678

Please sign in to comment.