From a6a60dcf563d3c109aed8d359805f202419d56ac Mon Sep 17 00:00:00 2001 From: Stanislas Polu Date: Fri, 27 Oct 2023 15:08:38 +0200 Subject: [PATCH] core: k8s (#2294) * core: k8s * bump grace period to 180s * core: graceful shutdown * github action * update apply_infra --- .github/workflows/deploy-core.yml | 54 ++++++++++++++++++++++++ core/Dockerfile | 12 ++++++ core/bin/dust_api.rs | 62 +++++++++++++++++++++------- core/dockerignore | 11 +++++ k8s/apply_infra.sh | 4 +- k8s/configmaps/core-configmap.yaml | 9 ++++ k8s/deployments/core-deployment.yaml | 54 ++++++++++++++++++++++++ k8s/services/core-service.yaml | 13 ++++++ 8 files changed, 204 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/deploy-core.yml create mode 100644 core/Dockerfile create mode 100644 core/dockerignore create mode 100644 k8s/configmaps/core-configmap.yaml create mode 100644 k8s/deployments/core-deployment.yaml create mode 100644 k8s/services/core-service.yaml diff --git a/.github/workflows/deploy-core.yml b/.github/workflows/deploy-core.yml new file mode 100644 index 000000000000..2c62d70d91aa --- /dev/null +++ b/.github/workflows/deploy-core.yml @@ -0,0 +1,54 @@ +name: Deploy Core + +on: + workflow_dispatch: + +concurrency: + group: deploy_core + cancel-in-progress: false + +env: + GCLOUD_PROJECT_ID: ${{ secrets.GCLOUD_PROJECT_ID }} + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Get short sha + id: short_sha + run: echo "short_sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + + - name: "Authenticate with Google Cloud" + uses: "google-github-actions/auth@v1" + with: + credentials_json: "${{ secrets.GCLOUD_SA_KEY }}" + + - name: "Set up Cloud SDK" + uses: "google-github-actions/setup-gcloud@v1" + + - name: Install gke-gcloud-auth-plugin + run: | + gcloud components install gke-gcloud-auth-plugin + + - name: Setup kubectl + run: | + gcloud container clusters get-credentials dust-kube 
--region us-central1 + + - name: Build the image on Cloud Build + run: | + chmod +x ./k8s/cloud-build.sh + ./k8s/cloud-build.sh core + + - name: Deploy the image on Kubernetes + run: | + chmod +x ./k8s/deploy-image.sh + ./k8s/deploy-image.sh gcr.io/$GCLOUD_PROJECT_ID/core-image:${{ steps.short_sha.outputs.short_sha }} core-deployment + + - name: Wait for rollout to complete + run: | + echo "Waiting for rollout to complete (web)" + kubectl rollout status deployment/core-deployment --timeout=10m diff --git a/core/Dockerfile b/core/Dockerfile new file mode 100644 index 000000000000..8322b1d61144 --- /dev/null +++ b/core/Dockerfile @@ -0,0 +1,12 @@ +FROM rust:1.70.0 as core + +WORKDIR /app + +COPY . . + +RUN cargo build --release + +EXPOSE 3001 + +# Set a default command, it will start the API service if no command is provided +CMD ["cargo", "run", "--release", "--bin", "dust-api"] \ No newline at end of file diff --git a/core/bin/dust_api.rs b/core/bin/dust_api.rs index 7d114331ae82..94d65bfd49e7 100644 --- a/core/bin/dust_api.rs +++ b/core/bin/dust_api.rs @@ -30,7 +30,10 @@ use serde_json::{json, Value}; use std::collections::{HashMap, HashSet}; use std::convert::Infallible; use std::sync::Arc; -use tokio::sync::mpsc::unbounded_channel; +use tokio::{ + signal::unix::{signal, SignalKind}, + sync::mpsc::unbounded_channel, +}; use tokio_stream::Stream; use tower_http::trace::{self, TraceLayer}; use tracing::Level; @@ -78,6 +81,23 @@ impl APIState { run_manager.pending_apps.push((app, credentials)); } + async fn stop_loop(&self) { + loop { + let pending_runs = { + let manager = self.run_manager.lock(); + utils::info(&format!( + "[GRACEFUL] {} stop_loop pending runs", + manager.pending_runs.len() + )); + manager.pending_runs.len() + }; + if pending_runs == 0 { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(1024)).await; + } + } + async fn run_loop(&self) -> Result<()> { let mut loop_count = 0; loop { @@ -121,8 +141,8 @@ impl APIState { }); }); 
loop_count += 1; - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - if loop_count % (10 * 10) == 0 { + tokio::time::sleep(std::time::Duration::from_millis(4)).await; + if loop_count % 1024 == 0 { let manager = self.run_manager.lock(); utils::info(&format!("{} pending runs", manager.pending_runs.len())); } @@ -1813,30 +1833,44 @@ fn main() { .layer(extract::Extension(state.clone())); // Start the APIState run loop. - let state = state.clone(); - tokio::task::spawn(async move { state.run_loop().await }); + let runloop_state = state.clone(); + tokio::task::spawn(async move { runloop_state.run_loop().await }); + + let (tx1, rx1) = tokio::sync::oneshot::channel::<()>(); + let (tx2, rx2) = tokio::sync::oneshot::channel::<()>(); - let (tx, rx) = tokio::sync::oneshot::channel(); let srv = axum::Server::bind(&"[::]:3001".parse().unwrap()) .serve(app.into_make_service()) .with_graceful_shutdown(async { - rx.await.ok(); + rx1.await.ok(); }); tokio::spawn(async move { if let Err(e) = srv.await { - eprintln!("server error: {}", e); + utils::error(&format!("server error: {}", e)); } + utils::info("[GRACEFUL] Server stopped"); + tx2.send(()).ok(); }); - // Wait for `ctrl+c` and stop the server - tokio::signal::ctrl_c().await.unwrap(); - println!("Ctrl+C received, stopping server..."); - let _ = tx.send(()); + utils::info(&format!("Current PID: {}", std::process::id())); + + let mut stream = signal(SignalKind::terminate()).unwrap(); + stream.recv().await; + + // Gracefully shut down the server + utils::info("[GRACEFUL] SIGTERM received, stopping server..."); + tx1.send(()).ok(); + + // Wait for the server to shutdown + utils::info("[GRACEFUL] Awaiting server shutdown..."); + rx2.await.ok(); - // Wait for another `ctrl+c` and exit - tokio::signal::ctrl_c().await.unwrap(); + // Wait for the run loop to finish. 
+ utils::info("[GRACEFUL] Awaiting stop loop..."); + state.stop_loop().await; + utils::info("[GRACEFUL] Exiting!"); Ok::<(), anyhow::Error>(()) }); } diff --git a/core/dockerignore b/core/dockerignore new file mode 100644 index 000000000000..42e6e8129b65 --- /dev/null +++ b/core/dockerignore @@ -0,0 +1,11 @@ +target + +# misc +.DS_Store +*.pem + +.env +.env*.local + +Dockerfile* +.dockerignore diff --git a/k8s/apply_infra.sh b/k8s/apply_infra.sh index 2efac3671e4c..b9025c3ec018 100755 --- a/k8s/apply_infra.sh +++ b/k8s/apply_infra.sh @@ -2,7 +2,6 @@ set -e - function apply_deployment { # This function applies a deployment, but if the deployment already exists, # it will replace the image with the current image to avoid a rolling update @@ -57,6 +56,7 @@ kubectl apply -f "$(dirname "$0")/configmaps/connectors-edge-configmap.yaml" kubectl apply -f "$(dirname "$0")/configmaps/blog-configmap.yaml" kubectl apply -f "$(dirname "$0")/configmaps/docs-configmap.yaml" kubectl apply -f "$(dirname "$0")/configmaps/alerting-temporal-configmap.yaml" +kubectl apply -f "$(dirname "$0")/configmaps/core-configmap.yaml" echo "-----------------------------------" echo "Applying backend configs" @@ -100,6 +100,7 @@ apply_deployment blog-deployment apply_deployment docs-deployment apply_deployment metabase-deployment apply_deployment alerting-temporal-deployment +apply_deployment core-deployment echo "-----------------------------------" @@ -114,6 +115,7 @@ kubectl apply -f "$(dirname "$0")/services/connectors-edge-service.yaml" kubectl apply -f "$(dirname "$0")/services/blog-service.yaml" kubectl apply -f "$(dirname "$0")/services/docs-service.yaml" kubectl apply -f "$(dirname "$0")/services/metabase-service.yaml" +kubectl apply -f "$(dirname "$0")/services/core-service.yaml" echo "-----------------------------------" diff --git a/k8s/configmaps/core-configmap.yaml b/k8s/configmaps/core-configmap.yaml new file mode 100644 index 000000000000..cf9296dcb13d --- /dev/null +++ 
# Deployment for the `core` API (the `dust-api` binary), exposed inside the cluster
# through `core-service` (port 80 -> 3001).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: core-deployment
spec:
  replicas: 3
  selector:
    matchLabels:
      app: core
  template:
    metadata:
      labels:
        # `app` and `name` are both used by the `core-service` selector.
        app: core
        name: core-pod
        admission.datadoghq.com/enabled: "true"
      annotations:
        # The `web` segment of the annotation key must match the container name below.
        ad.datadoghq.com/web.logs: '[{"source": "core","service": "core","tags": ["env:prod"]}]'
    spec:
      # On SIGTERM the API stops the HTTP server and then waits for pending runs to
      # drain before exiting; this is the time budget for that drain before SIGKILL.
      terminationGracePeriodSeconds: 180
      containers:
        - name: web
          image: gcr.io/or1g1n-186209/core-image:latest
          command: ["cargo", "run", "--release", "--bin", "dust-api"]
          imagePullPolicy: Always
          ports:
            - containerPort: 3001

          # Only mark the pod Ready once the API is actually accepting connections on
          # its listening port. Without this, the Service may route requests to a pod
          # that is still starting, and `kubectl rollout status` can report success
          # before the new pods are able to serve.
          readinessProbe:
            tcpSocket:
              port: 3001
            periodSeconds: 5

          envFrom:
            - configMapRef:
                name: core-config
            - secretRef:
                name: core-secrets
          env:
            # Address of the node-local Datadog agent.
            - name: DD_AGENT_HOST
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP

          volumeMounts:
            - name: service-account-volume
              mountPath: /etc/service-accounts

          # requests == limits for both CPU and memory.
          resources:
            requests:
              cpu: 1000m
              memory: 2.5Gi
            limits:
              cpu: 1000m
              memory: 2.5Gi

      volumes:
        - name: service-account-volume
          secret:
            secretName: gcp-service-account-secret