From 6eb4284631553a5b8e9e9bc671bed3fe06b480f5 Mon Sep 17 00:00:00 2001 From: John McCann Cunniff Jr Date: Sun, 10 Sep 2023 20:51:28 -0400 Subject: [PATCH] ADD infra alerter job --- api/anubis/jobs/infra_poller.py | 17 ++++++++ api/anubis/jobs/infra_warner.py | 62 ---------------------------- api/anubis/lms/theia.py | 41 ++++++++++++++++++ api/anubis/utils/email/event.py | 1 + k8s/chart/templates/infra-poller.yml | 48 +++++++++++++++++++++ k8s/debug/provision.sh | 5 ++- 6 files changed, 111 insertions(+), 63 deletions(-) create mode 100644 api/anubis/jobs/infra_poller.py delete mode 100644 api/anubis/jobs/infra_warner.py create mode 100644 k8s/chart/templates/infra-poller.yml diff --git a/api/anubis/jobs/infra_poller.py b/api/anubis/jobs/infra_poller.py new file mode 100644 index 000000000..bd645f425 --- /dev/null +++ b/api/anubis/jobs/infra_poller.py @@ -0,0 +1,17 @@ +import os +import time + +if 'SENTRY_DSN' in os.environ: + del os.environ['SENTRY_DSN'] + +from anubis.lms.theia import check_cluster_ides + + +def main(): + while True: + check_cluster_ides() + time.sleep(1) + + +if __name__ == "__main__": + main() diff --git a/api/anubis/jobs/infra_warner.py b/api/anubis/jobs/infra_warner.py deleted file mode 100644 index 7b5002786..000000000 --- a/api/anubis/jobs/infra_warner.py +++ /dev/null @@ -1,62 +0,0 @@ -import json -import os -import time - -if 'SENTRY_DSN' in os.environ: - del os.environ['SENTRY_DSN'] - -from anubis.lms.theia import get_active_theia_sessions -from anubis.utils.data import with_context -from datetime import datetime, timedelta -from kubernetes import config -from anubis.utils.email.event import send_email_event_admin -from anubis.utils.discord.webhook import send_webhook - - -@with_context -def check_cluster_ides(): - sessions = get_active_theia_sessions() - now = datetime.now() - - for session in sessions: - # Check for age - age = now - session.created - - # Check if it is old - old = age > timedelta(minutes=2) - - # Check for running - running = session.state == 'Running' - - # Check for - if old and not running: - reference_id = 'idewarning' - state = session.state - - # Send email warning - send_email_event_admin( - reference_id, - reference_id, - reference_id, - context={ - 'age': age, - 'now': datetime.now(), - 'session': json.dumps(session.data, indent=2), - 'state': state, - } - ) - - # Send webhook - send_webhook(f'Failed to start IDE within time :: {state}') - - -def main(): - config.load_incluster_config() - - while True: - check_cluster_ides() - time.sleep(1) - - -if __name__ == "__main__": - main() diff --git a/api/anubis/lms/theia.py b/api/anubis/lms/theia.py index 87790c5d7..cc55d34a7 100644 --- a/api/anubis/lms/theia.py +++ b/api/anubis/lms/theia.py @@ -1,7 +1,11 @@ +import json from datetime import datetime, timedelta from anubis.models import TheiaSession from anubis.utils.config import get_config_int +from anubis.utils.data import with_context +from anubis.utils.discord.webhook import send_webhook +from anubis.utils.email.event import send_email_event_admin def get_active_theia_sessions() -> list[TheiaSession]: @@ -15,3 +19,40 @@ def get_active_theia_sessions() -> list[TheiaSession]: ).all() return theia_sessions + + +@with_context +def check_cluster_ides(): + sessions = get_active_theia_sessions() + now = datetime.now() + + for session in sessions: + # Check for age + age = now - session.created + + # Check if it is old + old = age > timedelta(minutes=2) + + # Check for running + running = session.state == 'Running' + + # Check for + if old and not running: + reference_id = 'ide_warning' + state = session.state + + # Send email warning + send_email_event_admin( + reference_id, + reference_id, + reference_id, + context={ + 'age': age, + 'now': datetime.now(), + 'session': json.dumps(session.data, indent=2), + 'state': state, + } + ) + + # Send webhook + send_webhook(f'Failed to start IDE within time :: {state}') diff --git a/api/anubis/utils/email/event.py b/api/anubis/utils/email/event.py index 944c84f24..1b4c24386 100644 --- a/api/anubis/utils/email/event.py +++ b/api/anubis/utils/email/event.py @@ -77,6 +77,7 @@ def send_email_event( # Send email try: success = send_message(message) is not False + # logger.info(f'Sent email {message}') except Error as e: logger.error(f'Failed to send email!\nerror={e}\n\n{traceback.format_exc()}\nemail={message}') return diff --git a/k8s/chart/templates/infra-poller.yml b/k8s/chart/templates/infra-poller.yml new file mode 100644 index 000000000..1967b98d3 --- /dev/null +++ b/k8s/chart/templates/infra-poller.yml @@ -0,0 +1,48 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "chart.fullname" . }}-infra-poller + labels: + {{- include "chart.labels" . | nindent 4 }} + component: infra-poller +spec: + replicas: {{- if not .Values.offSemester }} {{ .Values.theia.poller.replicas }}{{- else }} 1{{- end }} + {{- if .Values.rollingUpdates }} + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + {{- end }} + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + component: infra-poller + template: + metadata: + labels: + {{- include "chart.selectorLabels" . | nindent 8 }} + component: infra-poller + spec: + {{- if and .Values.nodeSelector (not .Values.debug) }} + nodeSelector: + {{ .Values.nodeSelector | toYaml }} + {{- end }} + serviceAccountName: theia-poller + containers: + - name: poller + image: "{{ .Values.api.image }}:{{ .Values.tag }}" + imagePullPolicy: {{ .Values.imagePullPolicy }} + args: ["python3", "/opt/app/anubis/jobs/infra_poller.py"] + {{- if not .Values.debug}} + resources: + requests: + cpu: 200m + memory: 250Mi + limits: + cpu: 1000m + memory: 500Mi + {{- end }} + env: + {{- include "api.env" . | nindent 8 }} + diff --git a/k8s/debug/provision.sh b/k8s/debug/provision.sh index 15d8bd7c3..7f4dc71ee 100755 --- a/k8s/debug/provision.sh +++ b/k8s/debug/provision.sh @@ -85,11 +85,13 @@ kubectl create namespace anubis kubectl config set-context --current --namespace=anubis # Create a minimal mariadb deployment in a mariadb namespace. On -# prod, the mariadb is in a seperate namespace, so we do the same +# prod, the mariadb is in a separate namespace, so we do the same # here. echo 'Adding mariadb' helm upgrade --install mariadb bitnami/mariadb \ --set 'fullnameOverride=mariadb' \ + --set 'image.repository=bitnami/mariadb' \ + --set 'image.tag=10.6' \ --set 'auth.rootPassword=anubis' \ --set 'volumePermissions.enabled=true' \ --set 'auth.username=anubis' \ @@ -114,6 +116,7 @@ kubectl create secret generic api \ --from-literal=database-port=3306 \ --from-literal=redis-password=anubis \ --from-literal=discord-bot-token=anubis \ + --from-literal=discord-webhook=anubis \ --from-literal=secret-key=DEBUG \ --from-literal=sentry-dsn='' \ --namespace anubis