diff --git a/.gitignore b/.gitignore index 470f8c225..343602639 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ make.sh .envrc *.DS_Store *.DS_Store? +.vscode/ diff --git a/instruqt/troubleshoot-training/01-introduction/assignment.md b/instruqt/troubleshoot-training/01-introduction/assignment.md new file mode 100755 index 000000000..03ddda094 --- /dev/null +++ b/instruqt/troubleshoot-training/01-introduction/assignment.md @@ -0,0 +1,50 @@ +--- +slug: introduction +id: b5dftki3524w +type: challenge +title: Introduction +teaser: Practical Application of Support Bundles and Analyzers +notes: +- type: text + contents: In this track, we'll work together to apply some practical methods for + troubleshooting some Kubernetes problems using Replicated tooling. +tabs: +- title: Workstation + type: terminal + hostname: cloud-client +difficulty: intermediate +timelimit: 600 +--- + +👋 Introduction +=============== + +* **What you will do**: + * Learn to troubleshoot application & cluster problems +* **Who this is for**: + * This track is for anyone who will build KOTS applications **plus** anyone user-facing who support these applications: + * Full Stack / DevOps / Product Engineers + * Support Engineers + * Implementation / Field Engineers + * Success / Sales Engineers +* **Prerequisites**: + * Basic working knowledge of Linux and the `bash` shell +* **Outcomes**: + * You will be able to determine if the problem is in your application, in Kubernetes, or in the infrastructure environment + * You will reduce escalations and expedite time to remediation for such issues + +# Configure the VM environment + +## Set up the Workstation + +The environment is prepped for an *embedded cluster* installation. + +### Configure your editor + +Before we begin, let's choose an editor. 
The default editor is `nano`, but if you'd like to use `vim` instead, you can switch to it by running the following command and selecting option `2`: + +```bash +update-alternatives --config editor +``` + +Press **Check** when you're ready to begin. diff --git a/instruqt/troubleshoot-training/01-introduction/check-cloud-client b/instruqt/troubleshoot-training/01-introduction/check-cloud-client new file mode 100755 index 000000000..c52d3c26b --- /dev/null +++ b/instruqt/troubleshoot-training/01-introduction/check-cloud-client @@ -0,0 +1,3 @@ +#!/bin/sh + +exit 0 diff --git a/instruqt/troubleshoot-training/01-introduction/solve-cloud-client b/instruqt/troubleshoot-training/01-introduction/solve-cloud-client new file mode 100755 index 000000000..c52d3c26b --- /dev/null +++ b/instruqt/troubleshoot-training/01-introduction/solve-cloud-client @@ -0,0 +1,3 @@ +#!/bin/sh + +exit 0 diff --git a/instruqt/troubleshoot-training/02-troubleshoot-1/assignment.md b/instruqt/troubleshoot-training/02-troubleshoot-1/assignment.md new file mode 100755 index 000000000..b07ee519e --- /dev/null +++ b/instruqt/troubleshoot-training/02-troubleshoot-1/assignment.md @@ -0,0 +1,65 @@ +--- +slug: troubleshoot-1 +id: araxpgiqal1r +type: challenge +title: Where are my pods? +teaser: "\U0001F914" +notes: +- type: text + contents: The website is down +tabs: +- title: Workstation + type: terminal + hostname: cloud-client +difficulty: basic +timelimit: 3600 +--- +Let's imagine that our environment belongs to a customer, who is now experiencing an issue with their install. + +They've raised a rather unclear issue to your support team suggesting that the application "doesn't work" after one of their users accidentally made a change from the command line. + +They've shared a support bundle with you, and you've been asked to help investigate. + +Let's use the `sbctl` tool to inspect the support bundle and try to determine what's amiss. 
`sbctl` should already be installed and the customer's support bundle should be in your home folder. `sbctl` simulates having access to the customer's environment, but all of the data is taken from the support bundle. It lets us use the familiar `kubectl` tool to explore the customer's environment, even without direct access. + +When you've identified the problem, write out the command you would use to resolve the problem into a file at `/root/solution.txt` + +The answer should be one line, on the first line of the file. + +(The file does not exist, you will have to create it with your preferred text editor.) + +💡 Using `sbctl` +================= + +- Try `sbctl help` to see what commands are available + +💡 Hints +================= + +- Try the interactive shell prompt using `sbctl` and make sure to provide the path to the support bundle in your home folder + +- How are applications deployed in kubernetes? + +- What controls a pod's lifecycle? + +💡 More Hints +================= + +- How do I see deployments? + +Troubleshooting Procedure +================= + +Identify the problematic deployment from `kubectl get deployments -n `. Notice any pods that have 0 replicas, but should have 1 or more. + +✔️ Solution +================== + +A deployment has been scaled to 0 + +🛠️ Remediation +================= + +```bash +kubectl scale deployment --replicas=1 +``` diff --git a/instruqt/troubleshoot-training/02-troubleshoot-1/check-cloud-client b/instruqt/troubleshoot-training/02-troubleshoot-1/check-cloud-client new file mode 100755 index 000000000..ae2bfb35f --- /dev/null +++ b/instruqt/troubleshoot-training/02-troubleshoot-1/check-cloud-client @@ -0,0 +1,23 @@ +#!/bin/bash +# +# This script runs when the platform checks the challenge. +# +# The platform determines if the script was successful using the exit code of this +# script. If the exit code is not 0, the script fails. + +if [[ ! 
-f "/root/solution.txt" ]]; then + fail-message "solution.txt not found, please create it and write your answer within" + exit 1 +fi + +solution=$(head -n1 "/root/solution.txt" | sed 's/=/ /g' | sed -e 's/--namespace\ default//g' -e 's/-n\ default//g' | sed -re 's/^[[:blank:]]+|[[:blank:]]+$//g' -e 's/[[:blank:]]+/ /g' ) + +echo "solution: $solution" +echo "wanted : kubectl scale deployment frontend --replicas 1" + +if [[ "$solution" = "kubectl scale deployment frontend --replicas 1" ]]; then + exit 0 +fi + +fail-message "oops, your solution doesn't quite look correct, try again!" +exit 1 diff --git a/instruqt/troubleshoot-training/02-troubleshoot-1/setup-cloud-client b/instruqt/troubleshoot-training/02-troubleshoot-1/setup-cloud-client new file mode 100755 index 000000000..5f2a07e29 --- /dev/null +++ b/instruqt/troubleshoot-training/02-troubleshoot-1/setup-cloud-client @@ -0,0 +1,3 @@ +#!/bin/bash + +curl https://spooky.academy/support_bundles/troubleshoot_1_support_bundle.tar.gz -o support-bundle.tar.gz diff --git a/instruqt/troubleshoot-training/02-troubleshoot-1/solve-cloud-client b/instruqt/troubleshoot-training/02-troubleshoot-1/solve-cloud-client new file mode 100755 index 000000000..7abac6cbb --- /dev/null +++ b/instruqt/troubleshoot-training/02-troubleshoot-1/solve-cloud-client @@ -0,0 +1,4 @@ +#!/bin/bash + +rm -rf /root/support-bundle* +rm -rf /root/solution* diff --git a/instruqt/troubleshoot-training/03-troubleshoot-2/assignment.md b/instruqt/troubleshoot-training/03-troubleshoot-2/assignment.md new file mode 100755 index 000000000..74d977330 --- /dev/null +++ b/instruqt/troubleshoot-training/03-troubleshoot-2/assignment.md @@ -0,0 +1,100 @@ +--- +slug: troubleshoot-2 +id: gzv8orjeqdcg +type: challenge +title: CrashLoopBackOff +teaser: "\U0001F648" +notes: +- type: text + contents: Time to fix another problem... 
+tabs: +- title: Workstation + type: terminal + hostname: cloud-client +difficulty: intermediate +timelimit: 3600 +--- +The customer opens another issue, but this time pods seem to be crashing. + +Let's investigate our app and see if we can identify the issue. Again, we'll use `sbctl` to explore the support bundle. + +To pass this challenge, find the faulty resource, save the YAML spec for that resource to `~/solution.yaml`, correct the problem in the resource, then click "Next" to check your work. + +💡 Using `sbctl` +================= + +- Remember that you can use the interactive shell prompt with `sbctl shell -s ` + +💡 Using `kubectl` +================= + +- How do you make `kubectl` print output in YAML format? + -- What if you wanted to save that output to a file? + +💡 Hints +================= + +- How do you list pods? + +- How do you describe pods? + - What if you wanted to see data from multiple pods at once? + +- How do you get logs from a pod? + - What if you wanted to see a previous version of the pod's logs? + +- When would you look at `describe` output vs. gathering pod logs? + +- Review the [Kubernetes documentation on debugging Pods](https://kubernetes.io/docs/tasks/debug/debug-application/debug-running-pod/) + +💡 More Hints +================= + +- How do you find the exit code of a Pod? + +- What could it mean if a Pod is exiting before it has a chance to emit any logs? + +- Review the Linux exit code conventions: `0` means the process exited normally, `1`-`127` generally mean that the process exited because of a crash or error, and >`128` generally means that the process was killed by a signal (think Ctrl-C or the `kill` command). + +Troubleshooting Procedure +================= + +Identify the problematic Pod from `kubectl get pods -n `. Notice any pods that are not in the Running state. + +Describe the current state of the Pod with `kubectl describe pod -n `. 
Here are some things to look out for: + +- each Container's current **State** and **Reason** +- each Container's **Last State** and **Reason** +- the Last State's **Exit Code** +- each Container's **Ready** status +- the **Events** table + +For a Pod that is crashing, expect that the current state will be `Waiting`, `Terminated` or `Error`, and the last state will probably also be `Terminated`. Notice the reason for the termination, and especially notice the exit code. There are long-standing Unix shell conventions for exit codes, but they are not strictly enforced since applications can always set their own exit codes. + +In short, if the exit code is >128, then the application exited as a result of Kubernetes killing the Pod. If that's the case, you'll commonly see code 137 or 143, which is 128 + the value of the `kill` signal sent to the container. + +If the exit code is <128, then the application crashed or exited abnormally. If the exit code is 0, then the application exited normally (most commonly seen in init containers or Jobs/CronJobs) + +Look for any Events that may indicate a problem. Events by default last 1 hour, unless they occur repeatedly. Events in a repetition loop are especially noteworthy: + +```plaintext +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Warning BackOff 2d19h (x9075 over 4d4h) kubelet Back-off restarting failed container sentry-workers in pod sentry-worker-696456b57c-twpj7_default(82eb1dde-2987-4f58-af64-883470ffcb58) +``` + +Another way to get even more information about a pod is to use the `-o yaml` option with `kubectl get pods`. This will output the entire pod definition in YAML format. This is useful for debugging issues with the pod definition itself. Here you will see some info that isn't present in `describe pods`, such as Annotations, Tolerations, restart policy, ports, and volumes. 
+ +✔️ Solution +================= + +One of the deployments has a memory limit that is too low for the Pod to run successfully. + +🛠️ Remediation +================= + +Write the YAML spec for the affected deployment into a file at `~/solution.yaml`, then increase the memory limit for the Pod to a reasonable amount. You may have to make an educated guess about what the correct memory limit should be. + +To think about: + +- How can we make sure that this doesn't happen again? diff --git a/instruqt/troubleshoot-training/03-troubleshoot-2/check-cloud-client b/instruqt/troubleshoot-training/03-troubleshoot-2/check-cloud-client new file mode 100755 index 000000000..6a3713cd4 --- /dev/null +++ b/instruqt/troubleshoot-training/03-troubleshoot-2/check-cloud-client @@ -0,0 +1,29 @@ +#!/bin/bash + +if [[ ! -f /root/solution.yaml ]]; then + fail-message "solution.yaml not found in /root/, please create the file and try again" + exit 1 +fi + +kind=$(yq -r '.kind' /root/solution.yaml) + +if [[ ! "$kind" = "Deployment" ]]; then + fail-message "your solution doesn't look correct, you appear to have saved a resource that we weren't expecting" + exit 1 +fi + +limits=$(yq '.spec.template.spec.containers[0].resources.limits.memory' solution.yaml -r) + +if [[ "$limits" = "5M" ]]; then + fail-message "it looks like your solution is incorrect" + echo "limits = 5M" + exit 1 +fi + +rawSize=$(humanfriendly --parse-size "$limits") + +if [[ ! 
"$rawSize" -gt "5000000" ]];then + fail-message "it looks like your solution is incorrect" + echo "limits < 5M" + exit 1 +fi diff --git a/instruqt/troubleshoot-training/03-troubleshoot-2/setup-cloud-client b/instruqt/troubleshoot-training/03-troubleshoot-2/setup-cloud-client new file mode 100755 index 000000000..0f658c2e7 --- /dev/null +++ b/instruqt/troubleshoot-training/03-troubleshoot-2/setup-cloud-client @@ -0,0 +1,7 @@ +#!/bin/bash + +set -euxo +rm -rf /root/support-bundle* || true +rm /root/solution.txt || true + +curl https://spooky.academy/support_bundles/troubleshoot_2_support_bundle.tar.gz -o support-bundle.tar.gz diff --git a/instruqt/troubleshoot-training/03-troubleshoot-2/solve-cloud-client b/instruqt/troubleshoot-training/03-troubleshoot-2/solve-cloud-client new file mode 100755 index 000000000..7abac6cbb --- /dev/null +++ b/instruqt/troubleshoot-training/03-troubleshoot-2/solve-cloud-client @@ -0,0 +1,4 @@ +#!/bin/bash + +rm -rf /root/support-bundle* +rm -rf /root/solution* diff --git a/instruqt/troubleshoot-training/04-troubleshoot-3/assignment.md b/instruqt/troubleshoot-training/04-troubleshoot-3/assignment.md new file mode 100755 index 000000000..5dc1eccb6 --- /dev/null +++ b/instruqt/troubleshoot-training/04-troubleshoot-3/assignment.md @@ -0,0 +1,148 @@ +--- +slug: troubleshoot-3 +id: ppjhbjbxavp6 +type: challenge +title: Can you hear me major tom? +teaser: "\U0001F680" +notes: +- type: text + contents: Time to fix the problem... +tabs: +- title: Workstation + type: terminal + hostname: cloud-client +difficulty: advanced +timelimit: 3600 +--- +You get another report from a customer saying that their application isn't working, as if **a Pod is not responding and connections time out**. How would you begin to solve the problem? + +Here are some prompts to get you thinking in the right direction: + +- What cluster component handles communication between Pods? 
+- What cluster component handles load balancing for Pods and exposes Pods to connections from outside the cluster? + +To pass this challenge, save the broken resource to `~/solution.yaml`, edit it to be correct, then click "Next" to check your work. + +💡 Using `sbctl` +================= + +- Remember that you can use the interactive shell prompt with `sbctl shell -s ` + +💡 Using `kubectl` +================= + +- Remember that you can get a resource in YAML format by doing `kubectl get -o yaml` +- Also remember that you can save the output of a command to a file with the `>` operator: `kubectl get -o yaml > ~/resource.yaml` + +- Remember that you can ask for more than one kind of resource at a time by listing them with commas: `kubectl get pods,services,deployments` + +- Remember that you can ask for resources from all namespaces with the `--all-namespaces, -A` flag: `kubectl get pods -A` + +- Remember that you can show extra information, like IP addresses and which node a resource is scheduled on, with the `-o wide` flag: `kubectl get pods -A -o wide` + + +💡 Hints +================= + +- The Kubernetes documentation has a [great manual on debugging Services](https://kubernetes.io/docs/tasks/debug/debug-application/debug-service/) + +- Think about the traffic flow to a typical application: + - There are multiple **hops** in the network path, and any of them _could_ be a potential break in the path. + - Which hops can you identify? + +- How does traffic get to workloads inside kubernetes +- How does Kubernetes handle DNS resolution and load balancing for Pods? + +💡 More Hints +================= + +- A typical app usually has at least a frontend and a backend, and maybe some middle services. The frontend is usually expected to handle incoming requests. The backend is usually expected to handle requests *only* from the frontend or other services. + +- Some apps do not always crash if they can't connect to an API they depend on. 
It may not be obvious just by looking at the Pod states that there is a problem. Make sure to examine Pod logs in the app for any errors or warnings. + + +💡 More Hints about Services +================= + +- Kubernetes Services act as load balancers to Pods + - Pods advertise a `containerPort` that they are listening on, but we don't want to keep track of their IP addresses since they change all the time. Services are a way to abstract away the IP addresses of the Pods, and instead use a DNS name to connect to the Pods. A Service's name is recorded as a DNS record in the cluster. + - Services advertise a listening `port` and forward connections to a `targetPort`. + + ![Services at a glance](../assets/services-explained.png) + +Troubleshooting Procedure +================= + +First, let's consider what's deployed in the cluster and outline some assumptions. Use `kubectl get pods,deployments,services,ingresses -A -o wide` to list the resources involved in our problem. We want to include all namespaces because there could be components like an Ingress Controller in a different namespace than our application. We can see that Deployments are running, Services exist, and we can match Service names to Deployment names and make some assumptions about the network paths. Notice, also, that there is a single Ingress route that matches all hostnames, and an Ingress class called `contour` and an associated `projectcontour` Namespace with Pods. We need to consider that the Contour Ingress Controller is a potential hop in the network path. This is a common pattern for a Replicated Embedded Cluster installation. + + + + +#### Understand the limits of the problem +Let's consider all the hops in our system. We know there is a Pod, and a Service, and potentially an IngressController pod; troubleshooting any networking problem begins by figuring out the network path from the client to the server. 
At each step, we can check our connection with something like `netcat` or `curl` depending on what kind of protocol we are using. Perhaps the web frontend is not working, or perhaps a backend API is not working; start to understand the limits of the problem, and work backwards from Pods that are no longer responding to the client. + +List all the pods and check for any that are not `Running` with `kubectl get pods -n -o wide`. If there are any Pods that are not `Running`, check the logs with `kubectl logs -n `. If all the Pods are healthy, we'll check logs anyway because not every application will crash on a broken connection - many will continue to run, but emit errors. + +As you explore the app's logs, you should notice some error messages with patterns like "connection error" and "transport: Error while dialing:". These should stand out as red flags. Note the IP address and port number involved in the failed connection. + +Return to the list of resources involved in networking by doing `kubectl get pods,deployments,services,ingresses -A -o wide`. Cross-reference the failed connection IP address and port with the resources in the cluster. What do you find that matches? + +You can make this a little bit easier by using `grep`: `kubectl get pods,deployments,services,ingresses -A -o wide | grep `. This will show you only the resources that match the IP address you are looking for. + +Since this application has some clearly defined roles and names for each microservice, it should be easy to identify the Deployment and Pods that are not responding. Not all applications may be so clear, so let's also learn to use the Service's `selector` since, that's [how the Kubernetes model works](https://kubernetes.io/docs/tutorials/services/connect-applications-service/#the-kubernetes-model-for-connecting-containers). + +Use `kubectl describe svc` and `kubectl describe deployment` and note the `selector` and `labels` of the Service and Deployment. 
The `selector` of the Service should match the `labels` of the Pod spec inside the Deployment. If the `selector` does not match the `labels`, the Service will not forward connections to the Pod correctly. Example with only the relevant fields shown: + +```plaintext +root@cloud-client:~# kubectl describe deployment kotsadm +Name: kotsadm +Namespace: default +Pod Template: + Labels: app=kotsadm + +root@cloud-client:~# kubectl describe service kotsadm +Name: kotsadm +Namespace: default +Selector: app=kotsadm +``` + +You should be able to identify the faulty Service now. If you can't, remember that in a typical application, the frontend or "web tier" is usually expected to route connections between microservices. Review the logs of the frontend to see if it is handling connections correctly. + +Now that you can identify the faulty Service, we need to understand what went wrong and how to fix it. + +We can see the Service's **Type**, which tells us how the service is exposed; a `ClusterIP` service is only accessible within the cluster by other Pods. + +Describe the Service with `kubectl describe svc`, and note its listening `port` and the `targetPort` it forwards connections to. The `targetPort` should match the `containerPort` of associated Pods. If the Service's `targetPort` does not match a Pod's `containerPort`, the Service will not forward connections to the Pod correctly. 
+ +Remember that you can think of a Service like a traditional "load balancer" or "reverse proxy" that forwards connections to the correct Pod. The Service listens on a `port` and forwards connections to a `targetPort` on the Pod with a matching selector. The `targetPort` in a Service should be set to the `containerPort` in the Pod spec: + + +```plaintext +# kubectl describe service cartservice +Name: cartservice +Namespace: default +Selector: app=cartservice +... +Port: grpc 7070/TCP +TargetPort: 30000/TCP + +# kubectl describe deployment cartservice +Name: cartservice +Namespace: default +Pod Template: + Labels: app=cartservice + Containers: + server: + Image: gcr.io/google-samples/microservices-demo/cartservice:v0.9.0 + Port: 7070/TCP +``` + +Remediation +================= + +Patch or edit the affected service to correct the port number. You may have to refer to the other resources in the cluster to identify the correct port number. diff --git a/instruqt/troubleshoot-training/04-troubleshoot-3/check-cloud-client b/instruqt/troubleshoot-training/04-troubleshoot-3/check-cloud-client new file mode 100755 index 000000000..834becb86 --- /dev/null +++ b/instruqt/troubleshoot-training/04-troubleshoot-3/check-cloud-client @@ -0,0 +1,20 @@ +#!/bin/bash + +if [[ ! -f /root/solution.yaml ]]; then + fail-message "solution.yaml not found in /root/, please create the file and try again" + exit 1 +fi + +kind=$(yq -r '.kind' /root/solution.yaml) + +if [[ ! 
"$kind" = "Service" ]]; then + fail-message "your solution doesn't look correct, you appear to have saved a resource that we weren't expecting" + exit 1 +fi + +targetPort=$(yq '.spec.ports[0].targetPort' solution.yaml) + +if [[ "$targetPort" != "7070" ]]; then + fail-message "it looks like your solution is incorrect" + exit 1 +fi diff --git a/instruqt/troubleshoot-training/04-troubleshoot-3/setup-cloud-client b/instruqt/troubleshoot-training/04-troubleshoot-3/setup-cloud-client new file mode 100755 index 000000000..f8a6fc17b --- /dev/null +++ b/instruqt/troubleshoot-training/04-troubleshoot-3/setup-cloud-client @@ -0,0 +1,6 @@ +#!/bin/bash + +rm /root/solution* || true +rm /root/support-bundle* || true + +curl https://spooky.academy/support_bundles/troubleshoot_3_support_bundle.tar.gz -o support-bundle.tar.gz diff --git a/instruqt/troubleshoot-training/04-troubleshoot-3/solve-cloud-client b/instruqt/troubleshoot-training/04-troubleshoot-3/solve-cloud-client new file mode 100755 index 000000000..8f7134623 --- /dev/null +++ b/instruqt/troubleshoot-training/04-troubleshoot-3/solve-cloud-client @@ -0,0 +1,3 @@ +#!/bin/bash +rm -rf /root/support-bundle* +rm -rf /root/solution* diff --git a/instruqt/troubleshoot-training/05-troubleshoot-4/assignment.md b/instruqt/troubleshoot-training/05-troubleshoot-4/assignment.md new file mode 100755 index 000000000..4ce2b10a7 --- /dev/null +++ b/instruqt/troubleshoot-training/05-troubleshoot-4/assignment.md @@ -0,0 +1,62 @@ +--- +slug: troubleshoot-4 +id: vccuaq9got6x +type: challenge +title: Everything is broke... +teaser: "\U0001F527" +notes: +- type: text + contents: Time to fix the problem... +tabs: +- title: Workstation + type: terminal + hostname: cloud-client +difficulty: intermediate +timelimit: 3600 +--- +You get a new report from a customer saying that many pods are failing; some may display Errors, while others may be Evicted or Pending, or even in an Unknown state. 
+ +They have provided a cluster-down support bundle since they can't get one from the kots admin panel. + +This is a type of support-bundle that collects data from the node itself rather than from inside the cluster. + +Extract the support bundle with + +```run +tar -xvf support-bundle.tar.gz +``` + +Explore it to determine the issue. + +Once you think you have your answer, run: + +``` +quiz +``` + +💡 Hints +================= + +Host support bundles have some key files: +- `analysis.json` contains the analysis results of the support bundle +- `host-collectors/` contains the raw output from the collectors +- `host-collectors/run-host/crictl-logs*` contain logs from important containers + +💡 More Hints +================= + +You can pull warnings from the analysis.json with a simple jq filter: + +```run +cd /root/support-bundle-2024-04-17T09_50_09 +jq '.[] | select(.insight.severity == "warn")' analysis.json +``` + +`host-collectors/run-host/df.txt` shows the output of running `df` on the host + +✔️ Solution +================= + +The node's disk is full, which causes problems in the operating system as well as with Kubernetes. Pod eviction thresholds have been exceeded, causing pods to be evicted and image garbage collection to remove images from the node. 
+ +We have a Troubleshoot spec that looks for particularly large files at https://github.com/replicatedhq/troubleshoot-specs/blob/main/host/resource-contention.yaml#L36 diff --git a/instruqt/troubleshoot-training/05-troubleshoot-4/check-cloud-client b/instruqt/troubleshoot-training/05-troubleshoot-4/check-cloud-client new file mode 100755 index 000000000..fc9112c34 --- /dev/null +++ b/instruqt/troubleshoot-training/05-troubleshoot-4/check-cloud-client @@ -0,0 +1,9 @@ +#!/bin/bash +set -exuo pipefail + +if [[ -f /tmp/challenge_ok ]]; then + exit 0 +fi + +fail-message "you don't appear to have picked the correct answer" +exit 1 diff --git a/instruqt/troubleshoot-training/05-troubleshoot-4/setup-cloud-client b/instruqt/troubleshoot-training/05-troubleshoot-4/setup-cloud-client new file mode 100755 index 000000000..18d012c46 --- /dev/null +++ b/instruqt/troubleshoot-training/05-troubleshoot-4/setup-cloud-client @@ -0,0 +1,42 @@ +#!/bin/bash +set -exuo pipefail + +rm -rf /root/support-bundle* +rm -rf /root/solution* + +curl https://spooky.academy/support_bundles/troubleshoot_4_support_bundle.tar.gz -o support-bundle.tar.gz + +cd /opt/ +tee quiz.go << EOF +package main + +import ( + "fmt" + "os" +) + +func main() { + fmt.Println("Which of the following do you think is the cause of the problem:") + fmt.Println("A) Local host does not resolve") + fmt.Println("B) The container runtime is broken") + fmt.Println("C) The disk on the node is full") + fmt.Println("D) Quantum time calculations backfired in the flux capacitor") + + fmt.Printf("%s: ", "Type a letter") + var answer string + _, err := fmt.Scanln(&answer) + if err != nil { + fmt.Println("oops, something seems to have gone wrong, try re-running me") + fmt.Println(err) + } + if answer == "c" || answer == "C" { + fmt.Println("congratulations!") + os.Create("/tmp/challenge_ok") + } else { + fmt.Println("oops! 
that doesn't appear to be the correct answer!") + } +} +EOF +go build ./quiz.go +mv quiz /usr/bin/ +rm quiz.go diff --git a/instruqt/troubleshoot-training/05-troubleshoot-4/solve-cloud-client b/instruqt/troubleshoot-training/05-troubleshoot-4/solve-cloud-client new file mode 100755 index 000000000..7abac6cbb --- /dev/null +++ b/instruqt/troubleshoot-training/05-troubleshoot-4/solve-cloud-client @@ -0,0 +1,4 @@ +#!/bin/bash + +rm -rf /root/support-bundle* +rm -rf /root/solution* diff --git a/instruqt/troubleshoot-training/06-troubleshoot-5/assignment.md b/instruqt/troubleshoot-training/06-troubleshoot-5/assignment.md new file mode 100755 index 000000000..73753aed6 --- /dev/null +++ b/instruqt/troubleshoot-training/06-troubleshoot-5/assignment.md @@ -0,0 +1,63 @@ +--- +slug: troubleshoot-5 +id: blomhvawugyh +type: challenge +title: It can't be DNS... +teaser: It's always DNS +notes: +- type: text + contents: Time to fix the problem... +tabs: +- title: Workstation + type: terminal + hostname: cloud-client +difficulty: advanced +timelimit: 3600 +--- +A new issue has been reported saying that there are DNS resolution failures in some Pod logs. + +The customer has provided a support bundle + +How do you begin to troubleshoot the problem? + +once you think you know the answer, run: + +```run +quiz +``` + +💡 Hints +================= + +- How do Pods resolve DNS names? + +- Start by checking for any pods that may be failing or in a crash loop, and have a look at the pod logs. You may want to use the `--previous` flag to see the logs from the previous instance of the Pod. + +- Keep on the lookout for `tcp: lookup : no such host`, `getaddrinfo failed` or `Name or service not found` to confirm DNS resolution failures. + +- Try to determine any patterns that may be present. Does the problem affect a single Pod, multiple Pods, or all Pods? + - Is there a pattern that affects only Pods on a specific Node or Namespace? 
+ +- If the behaviour affects only a single Pod, it might be a good idea to delete the Pod and let Kubernetes recreate it. But, if the problem affects multiple Pods, it's more likely a problem in `coredns` or `kube-dns` itself. + +- Review the [Debugging DNS Resolution](https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/) article from the Kubernetes project. + + +💡 More Hints +================= + +- The DNS service in the cluster can be user-configured. How would a cluster admin customize the DNS configs? + +- The DNS zone for a Kubernetes cluster is expected to be `cluster.local`. The fully-qualified domain name for a Service would be `.svc.cluster.local`. + +- You can verify if queries are being received by `coredns` by [configuring logging](https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/#are-dns-queries-being-received-processed). Enable logging for `coredns` and then send some test queries. What responses are logged on the server side? + +💡 Even More Hints +================= + +- an NXDOMAIN response is returned when a DNS query is made for a name that does not exist in the DNS zone. This is a valid response, so DNS is _working_; if we expect the Kubernetes zone to be `cluster.local`, why are we getting `NXDOMAIN` responses for `cluster.local`? + +✔️ Solution +================= + +The `coredns` deployment has been reconfigured to only answer for a DNS zone of `cluster.nonlocal`. This is causing DNS resolution failures for Pods that are expecting to resolve names in the `cluster.local` zone. 
diff --git a/instruqt/troubleshoot-training/06-troubleshoot-5/check-cloud-client b/instruqt/troubleshoot-training/06-troubleshoot-5/check-cloud-client new file mode 100755 index 000000000..fc9112c34 --- /dev/null +++ b/instruqt/troubleshoot-training/06-troubleshoot-5/check-cloud-client @@ -0,0 +1,9 @@ +#!/bin/bash +set -exuo pipefail + +if [[ -f /tmp/challenge_ok ]]; then + exit 0 +fi + +fail-message "you don't appear to have picked the correct answer" +exit 1 diff --git a/instruqt/troubleshoot-training/06-troubleshoot-5/setup-cloud-client b/instruqt/troubleshoot-training/06-troubleshoot-5/setup-cloud-client new file mode 100755 index 000000000..b315805fa --- /dev/null +++ b/instruqt/troubleshoot-training/06-troubleshoot-5/setup-cloud-client @@ -0,0 +1,46 @@ +#!/bin/bash +# +# + +rm -rf /root/support-bundle* +rm -rf /root/solution* + +curl https://spooky.academy/support_bundles/troubleshoot_5_support_bundle.tar.gz -o support-bundle.tar.gz + +cd /opt/ +tee quiz.go << EOF +package main + +import ( + "fmt" + "os" +) + +func main() { + fmt.Println("Which of the following do you think is the cause of the problem:") + fmt.Println("A) localhost does not resolve") + fmt.Println("B) There is no internet connection") + fmt.Println("C) The disk on the node is full") + fmt.Println("D) Coredns is misconfigured") + fmt.Println("E) Bogon particles in the mainframes spline array") + + fmt.Printf("%s: ", "Type a letter") + var answer string + _, err := fmt.Scanln(&answer) + if err != nil { + fmt.Println("oops, something seems to have gone wrong, try re-running me") + fmt.Println(err) + } + if answer == "d" || answer == "D" { + fmt.Println("congratulations!") + os.Create("/tmp/challenge_ok") + } else { + fmt.Println("oops! 
that doesn't appear to be the correct answer!") + } +} +EOF +go build ./quiz.go +rm /usr/bin/quiz +mv quiz /usr/bin/ +rm quiz.go +rm /tmp/challenge_ok diff --git a/instruqt/troubleshoot-training/06-troubleshoot-5/solve-cloud-client b/instruqt/troubleshoot-training/06-troubleshoot-5/solve-cloud-client new file mode 100755 index 000000000..8f7134623 --- /dev/null +++ b/instruqt/troubleshoot-training/06-troubleshoot-5/solve-cloud-client @@ -0,0 +1,3 @@ +#!/bin/bash +rm -rf /root/support-bundle* +rm -rf /root/solution* diff --git a/instruqt/troubleshoot-training/07-support-bundle-types/assignment.md b/instruqt/troubleshoot-training/07-support-bundle-types/assignment.md new file mode 100755 index 000000000..b86944e13 --- /dev/null +++ b/instruqt/troubleshoot-training/07-support-bundle-types/assignment.md @@ -0,0 +1,17 @@ +--- +slug: support-bundle-types +id: 0p5u7u2ecjih +type: quiz +title: support bundle types +teaser: A quick quiz +answers: +- Manual command execution +- Host collector support bundle +- KOTS support bundle +- Log files +solution: +- 1 +difficulty: basic +timelimit: 600 +--- +If a customer's cluster is unresponsive, what is the easiest way to inspect the environment? diff --git a/instruqt/troubleshoot-training/08-super-secret/assignment.md b/instruqt/troubleshoot-training/08-super-secret/assignment.md new file mode 100755 index 000000000..08632c6ac --- /dev/null +++ b/instruqt/troubleshoot-training/08-super-secret/assignment.md @@ -0,0 +1,23 @@ +--- +slug: super-secret +id: 3c2bbx2sxndy +type: quiz +title: super secret +teaser: another quiz +answers: +- There's no possibility of data extraction +- Extract the bundle through a secret backdoor +- Work with the customer to write redactor specs for their environment +- Give up +solution: +- 2 +difficulty: basic +timelimit: 600 +--- +One of your customers is under strict governmental compliance. 
They have an issue in one of their clusters and inform you that they cannot send a support bundle because it would contain too much sensitive information. + +How would you proceed? + +🙈 Hint +============== +https://troubleshoot.sh/docs/redact/ diff --git a/instruqt/troubleshoot-training/assets/deploy.png b/instruqt/troubleshoot-training/assets/deploy.png new file mode 100644 index 000000000..e905e1612 Binary files /dev/null and b/instruqt/troubleshoot-training/assets/deploy.png differ diff --git a/instruqt/troubleshoot-training/assets/services-explained.png b/instruqt/troubleshoot-training/assets/services-explained.png new file mode 100644 index 000000000..e3e6c91f9 Binary files /dev/null and b/instruqt/troubleshoot-training/assets/services-explained.png differ diff --git a/instruqt/troubleshoot-training/config.yml b/instruqt/troubleshoot-training/config.yml new file mode 100644 index 000000000..1abe82522 --- /dev/null +++ b/instruqt/troubleshoot-training/config.yml @@ -0,0 +1,5 @@ +version: "3" +containers: +- name: cloud-client + image: gcr.io/instruqt/cloud-client + shell: /bin/bash diff --git a/instruqt/troubleshoot-training/track.yml b/instruqt/troubleshoot-training/track.yml new file mode 100755 index 000000000..6266b1fb9 --- --- /dev/null +++ b/instruqt/troubleshoot-training/track.yml @@ -0,0 +1,22 @@ +slug: troubleshoot-training +id: 3ytyltkwdclm +title: Troubleshoot Training (EC) +teaser: Get hands-on, practical application at debugging problems on Replicated embedded + clusters (aka kURL) +description: |+ + In this track, we invite you to learn how to triage, diagnose, and solve problems with the Replicated platform and support tooling. 
+ +icon: https://cdn.instruqt.com/assets/templates/kubernetes.png +level: intermediate +tags: +- instructor-led +owner: replicated +developers: +- danj@replicated.com +lab_config: + overlay: false + width: 33 + position: right + feedback_recap_enabled: true + loadingMessages: true +checksum: "2772022616982256640" diff --git a/instruqt/troubleshoot-training/track_scripts/setup-cloud-client b/instruqt/troubleshoot-training/track_scripts/setup-cloud-client new file mode 100755 index 000000000..bcd35a8d8 --- /dev/null +++ b/instruqt/troubleshoot-training/track_scripts/setup-cloud-client @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# This set line ensures that all failures will cause the script to error and exit +set -euxo pipefail + +# Wait for Instruqt bootstrap to be complete +while [ ! -f /opt/instruqt/bootstrap/host-bootstrap-completed ] +do + echo "Waiting for Instruqt to finish booting the VM" + sleep 1 +done + +DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y jq yq golang bash-completion +pip install humanfriendly + +# install krew +( + set -x; cd "$(mktemp -d)" && + OS="$(uname | tr '[:upper:]' '[:lower:]')" && + ARCH="$(uname -m | sed -e 's/x86_64/amd64/' -e 's/\(arm\)\(64\)\?.*/\1\2/' -e 's/aarch64$/arm64/')" && + KREW="krew-${OS}_${ARCH}" && + curl -fsSLO "https://github.com/kubernetes-sigs/krew/releases/latest/download/${KREW}.tar.gz" && + tar zxvf "${KREW}.tar.gz" && + ./"${KREW}" install krew +) + +export PATH=$PATH:$HOME/.krew/bin +echo "export PATH=$PATH:$HOME/.krew/bin" | tee -a /root/.bashrc +echo "export SHELL=bash" | tee -a /root/.bashrc + +kubectl krew install preflight +kubectl krew install support-bundle + +kubectl completion bash | sudo tee /etc/bash_completion.d/kubectl > /dev/null +chmod a+r /etc/bash_completion.d/kubectl + +# install sbctl +curl -LO https://github.com/replicatedhq/sbctl/releases/latest/download/sbctl_linux_amd64.tar.gz \ + && tar -xzvf sbctl_linux_amd64.tar.gz -C /tmp sbctl \ + && cp /tmp/sbctl 
/usr/local/bin/sbctl \ + && rm sbctl_linux_amd64.tar.gz + +mkdir /opt/backups