From ffd246044cbef2f0dc2f3daee51c067c298ad1f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 09:26:17 +0300
Subject: [PATCH 01/10] Restructured gcp docs a bit

---
 deployment/gcp/README.md | 52 ++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/deployment/gcp/README.md b/deployment/gcp/README.md
index b27bd2a..7619110 100644
--- a/deployment/gcp/README.md
+++ b/deployment/gcp/README.md
@@ -19,7 +19,7 @@ For the remainder of these instructions, you are assumed to be in the `deploymen
 export PROJECT="foo-sandbox"
 ```
 
-## Provision GCP Resources
+## (One time) Provision GCP Resources
 
 ### Configure the GCP Project
 
@@ -50,17 +50,24 @@ gcloud container clusters get-credentials crawl1
 
 This allows subsequent `kubectl` commands to interact with our cluster (using the context `gke_{PROJECT}_{ZONE}_{CLUSTER_NAME}`)
 
-## Build and push Docker image to GCR
+## (Optional) Configure sentry credentials
 
-(Optional) If one of [the pre-built OpenWPM Docker images](https://hub.docker.com/r/openwpm/openwpm/tags) are not sufficient:
+Set the Sentry DSN as a kubectl secret (change `foo` below):
 ```
-cd ../openwpm-crawler/OpenWPM; docker build -t gcr.io/$PROJECT/openwpm .; cd -
-gcloud auth configure-docker
-docker push gcr.io/$PROJECT/openwpm
+kubectl create secret generic sentry-config \
+--from-literal=sentry_dsn=foo
+```
+
+To run crawls without Sentry, remove the following from the crawl config after it has been generated below:
+```
+        - name: SENTRY_DSN
+          valueFrom:
+            secretKeyRef:
+              name: sentry-config
+              key: sentry_dsn
 ```
-Remember to change the `crawl.yaml` to point to `image: gcr.io/$PROJECT/openwpm`.
 
-## Allow the cluster to access AWS S3
+## (One time) Allow the cluster to access AWS S3
 
 Make sure that your AWS credentials are stored in `~/.aws/credentials` as per:
 
@@ -75,6 +82,16 @@ Then run:
 ./aws_credentials_as_kubectl_secrets.sh
 ```
 
+## Build and push Docker image to GCR
+
+(Optional) If none of [the pre-built OpenWPM Docker images](https://hub.docker.com/r/openwpm/openwpm/tags) is sufficient:
+```
+cd ../openwpm-crawler/OpenWPM; docker build -t gcr.io/$PROJECT/openwpm .; cd -
+gcloud auth configure-docker
+docker push gcr.io/$PROJECT/openwpm
+```
+Remember to change the `crawl.yaml` to point to `image: gcr.io/$PROJECT/openwpm`.
+
 ## Deploy the redis server which we use for the work queue
 
 ```
@@ -128,23 +145,6 @@ cp crawl.tmpl.yaml crawl.yaml
 
 Note: A useful naming convention for `CRAWL_DIRECTORY` is `YYYY-MM-DD_description_of_the_crawl`.
 
-## (Optional) Configure sentry credentials
-
-Set the Sentry DSN as a kubectl secret (change `foo` below):
-```
-kubectl create secret generic sentry-config \
---from-literal=sentry_dsn=foo
-```
-
-To run crawls without Sentry, remove the following from the crawl config:
-```
-        - name: SENTRY_DSN
-          valueFrom:
-            secretKeyRef:
-              name: sentry-config
-              key: sentry_dsn
-```
-
 ## Start the crawl
 
 When you are ready, deploy the crawl:
@@ -180,7 +180,7 @@ Contents of the queue:
 lrange crawl-queue 0 -1
 ```
 
-#### OpenWPM progress and logs
+#### Crawl progress and logs
 
 Check out the [GCP GKE Console](https://console.cloud.google.com/kubernetes/workload)
 

From 74fb271b8d65ba1d751e5fb7daa0ad62361f0d41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 10:04:54 +0300
Subject: [PATCH 02/10] Start cluster with non-preemptible nodes by default

---
 deployment/gcp/README.md            | 8 +++++++-
 deployment/gcp/start_gke_cluster.sh | 6 +++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/deployment/gcp/README.md b/deployment/gcp/README.md
index 7619110..f78599d 100644
--- a/deployment/gcp/README.md
+++ b/deployment/gcp/README.md
@@ -31,7 +31,7 @@ export PROJECT="foo-sandbox"
 
 ### Setup GKE Cluster
 
-The following command will create a zonal GKE cluster with [preemptible](https://cloud.google.com/preemptible-vms/) [n1-highcpu-16](https://cloud.google.com/compute/all-pricing) nodes ($0.1200/node/h).
+The following command will create a zonal GKE cluster with [n1-highcpu-16](https://cloud.google.com/compute/all-pricing) nodes ($0.5672/node/h).
 
 You may want to adjust fields within `./start_gke_cluster.sh` where appropriate such as:
 - num-nodes, min-nodes, max-nodes
@@ -42,6 +42,12 @@ You may want to adjust fields within `./start_gke_cluster.sh` where appropriate
 ./start_gke_cluster.sh crawl1
 ```
 
+Note: For testing, you can use [preemptible](https://cloud.google.com/preemptible-vms/) nodes ($0.1200/node/h) instead:
+
+```
+./start_gke_cluster.sh crawl1 --preemptible
+```
+
 ### Fetch kubernetes cluster credentials for use with `kubectl`
 
 ```
diff --git a/deployment/gcp/start_gke_cluster.sh b/deployment/gcp/start_gke_cluster.sh
index 18241db..f0a308b 100755
--- a/deployment/gcp/start_gke_cluster.sh
+++ b/deployment/gcp/start_gke_cluster.sh
@@ -2,10 +2,11 @@
 set -e
 
 if [[ $# -lt 1 ]]; then
-    echo "Usage: start_gke_cluster.sh cluster_name" >&2
+    echo "Usage: start_gke_cluster.sh cluster_name additional_args" >&2
     exit 1
 fi
 CLUSTER_NAME=$1
+ADDITIONAL_ARGS="${*:2}"
 
 gcloud container clusters create $CLUSTER_NAME \
 --zone us-central1-f \
@@ -17,5 +18,4 @@ gcloud container clusters create $CLUSTER_NAME \
 --min-nodes=0 \
 --max-nodes=30 \
 --enable-autoscaling \
---min-cpu-platform="Intel Broadwell" \
---preemptible
+--min-cpu-platform="Intel Broadwell" $ADDITIONAL_ARGS

From 9a68482db85cf683f420ce1ab18a841645cb9712 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 13:10:26 +0300
Subject: [PATCH 03/10] Use Google Cloud Memorystore for Redis in GCP instead
 of a cluster-deployed redis instance

---
 deployment/gcp/README.md            | 44 ++++++++++++++++++++---------
 deployment/gcp/crawl.tmpl.yaml      |  2 ++
 deployment/gcp/redis.yaml           | 26 -----------------
 deployment/gcp/start_gke_cluster.sh |  1 +
 4 files changed, 34 insertions(+), 39 deletions(-)
 delete mode 100644 deployment/gcp/redis.yaml

diff --git a/deployment/gcp/README.md b/deployment/gcp/README.md
index f78599d..9560d26 100644
--- a/deployment/gcp/README.md
+++ b/deployment/gcp/README.md
@@ -31,7 +31,7 @@ export PROJECT="foo-sandbox"
 
 ### Setup GKE Cluster
 
-The following command will create a zonal GKE cluster with [n1-highcpu-16](https://cloud.google.com/compute/all-pricing) nodes ($0.5672/node/h).
+The following command will create a zonal GKE cluster with [n1-highcpu-16](https://cloud.google.com/compute/all-pricing) nodes ($0.5672/node/h) and [IP-Alias enabled](https://cloud.google.com/kubernetes-engine/docs/how-to/alias-ips#creating_a_new_cluster_with_ip_aliases) (this makes it a bit easier to connect to managed Redis instances from the cluster).
 
 You may want to adjust fields within `./start_gke_cluster.sh` where appropriate such as:
 - num-nodes, min-nodes, max-nodes
@@ -100,10 +100,23 @@ Remember to change the `crawl.yaml` to point to `image: gcr.io/$PROJECT/openwpm`
 
 ## Deploy the redis server which we use for the work queue
 
+This will launch a 1GB Basic tier Google Cloud Memorystore for Redis instance ($0.049/GB/hour):
 ```
-kubectl apply -f redis.yaml
+gcloud redis instances create crawlredis --size=1 --region=us-central1 --redis-version=redis_4_0
 ```
 
+Next, use the following output:
+```
+gcloud redis instances describe crawlredis --region=us-central1
+```
+... to set the corresponding env var (replace the example IP below with the `host` value from the output):
+
+```
+export REDIS_HOST=10.0.0.3
+```
+
+(See https://cloud.google.com/memorystore/docs/redis/connecting-redis-instance for more information.)
+
 ## Adding sites to be crawled to the queue
 
 Create a comma-separated site list as per:
@@ -140,14 +153,14 @@ cd ../../; python -m utilities.get_sampled_sites; cd -
 
 Since each crawl is unique, you need to configure your `crawl.yaml` deployment configuration. We have provided a template to start from:
 ```
-cp crawl.tmpl.yaml crawl.yaml
+envsubst < ./crawl.tmpl.yaml > crawl.yaml
 ```
 
-- Update `crawl.yaml`. This may include:
-    - spec.parallelism
-    - spec.containers.image
-    - spec.containers.env
-    - spec.containers.resources
+Use of `envsubst` has already replaced `$REDIS_HOST` with the value of the env var set previously, but you may still want to adapt `crawl.yaml`:
+- spec.parallelism
+- spec.containers.image
+- spec.containers.env
+- spec.containers.resources
 
 Note: A useful naming convention for `CRAWL_DIRECTORY` is `YYYY-MM-DD_description_of_the_crawl`.
 
@@ -167,8 +180,8 @@ Note that for the remainder of these instructions, `metadata.name` is assumed to
 
 Open a temporary instance and launch redis-cli:
 ```
-kubectl attach temp -c temp -i -t || kubectl run --generator=run-pod/v1 -i --tty temp --image redis --command "/bin/bash"
-redis-cli -h redis
+kubectl attach redisbox -c redisbox -i -t || kubectl run --generator=run-pod/v1 -i --tty redisbox --image=gcr.io/google_containers/redis:v1 --env REDIS_HOST=$REDIS_HOST -- bash
+redis-cli -h $REDIS_HOST
 ```
 
 Current length of the queue:
@@ -186,6 +199,11 @@ Contents of the queue:
 lrange crawl-queue 0 -1
 ```
 
+Note: To re-connect to an already running redis box pod:
+```
+kubectl attach redisbox -c redisbox -i -t
+```
+
 #### Crawl progress and logs
 
 Check out the [GCP GKE Console](https://console.cloud.google.com/kubernetes/workload)
@@ -216,12 +234,12 @@ kubectl describe job openwpm-crawl
 
 The crawl data will end up in Parquet format in the S3 bucket that you configured.
 
-### Clean up created pods, services and local artifacts
+### Clean up created pods, instances and local artifacts
 
 ```
-kubectl delete -f redis.yaml
 kubectl delete -f crawl.yaml
-kubectl delete pod temp
+gcloud redis instances delete crawlredis --region=us-central1
+kubectl delete pod redisbox
 ```
 
 ### Decrease the size of the cluster while it is not in use
diff --git a/deployment/gcp/crawl.tmpl.yaml b/deployment/gcp/crawl.tmpl.yaml
index 6088519..abb43ae 100644
--- a/deployment/gcp/crawl.tmpl.yaml
+++ b/deployment/gcp/crawl.tmpl.yaml
@@ -27,6 +27,8 @@ spec:
               key: aws_secret_access_key
         - name: NUM_BROWSERS
           value: '1'
+        - name: REDIS_HOST
+          value: '$REDIS_HOST'
         - name: REDIS_QUEUE_NAME
           value: 'crawl-queue'
         - name: CRAWL_DIRECTORY
diff --git a/deployment/gcp/redis.yaml b/deployment/gcp/redis.yaml
deleted file mode 100644
index ae5f333..0000000
--- a/deployment/gcp/redis.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
-  name: redis-master
-  labels:
-    app: redis
-spec:
-  containers:
-    - name: master
-      image: redis
-      env:
-        - name: MASTER
-          value: "true"
-      ports:
-        - containerPort: 6379
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: redis
-spec:
-  selector:
-    app: redis
-  ports:
-    - port: 6379
-      targetPort: 6379
diff --git a/deployment/gcp/start_gke_cluster.sh b/deployment/gcp/start_gke_cluster.sh
index f0a308b..8cc02ea 100755
--- a/deployment/gcp/start_gke_cluster.sh
+++ b/deployment/gcp/start_gke_cluster.sh
@@ -18,4 +18,5 @@ gcloud container clusters create $CLUSTER_NAME \
 --min-nodes=0 \
 --max-nodes=30 \
 --enable-autoscaling \
+--enable-ip-alias \
 --min-cpu-platform="Intel Broadwell" $ADDITIONAL_ARGS

From d7c3bebf54d185b5e12fb5b1ea32cea03509f9d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 13:53:36 +0300
Subject: [PATCH 04/10] Loading site list into redis via a temporary redis-box
 pod deployed to the cluster

---
 deployment/gcp/README.md                | 11 +++--------
 deployment/gcp/redis-box.yaml           | 10 ++++++++++
 deployment/load_site_list_into_redis.sh | 10 ++++++++--
 3 files changed, 21 insertions(+), 10 deletions(-)
 create mode 100644 deployment/gcp/redis-box.yaml

diff --git a/deployment/gcp/README.md b/deployment/gcp/README.md
index 9560d26..f3b6adf 100644
--- a/deployment/gcp/README.md
+++ b/deployment/gcp/README.md
@@ -180,8 +180,8 @@ Note that for the remainder of these instructions, `metadata.name` is assumed to
 
 Open a temporary instance and launch redis-cli:
 ```
-kubectl attach redisbox -c redisbox -i -t || kubectl run --generator=run-pod/v1 -i --tty redisbox --image=gcr.io/google_containers/redis:v1 --env REDIS_HOST=$REDIS_HOST -- bash
-redis-cli -h $REDIS_HOST
+kubectl apply -f redis-box.yaml
+kubectl exec -it redis-box -- sh -c "redis-cli -h $REDIS_HOST"
 ```
 
 Current length of the queue:
@@ -199,11 +199,6 @@ Contents of the queue:
 lrange crawl-queue 0 -1
 ```
 
-Note: To re-connect to an already running redis box pod:
-```
-kubectl attach redisbox -c redisbox -i -t
-```
-
 #### Crawl progress and logs
 
 Check out the [GCP GKE Console](https://console.cloud.google.com/kubernetes/workload)
@@ -239,7 +234,7 @@ The crawl data will end up in Parquet format in the S3 bucket that you configure
 ```
 kubectl delete -f crawl.yaml
 gcloud redis instances delete crawlredis --region=us-central1
-kubectl delete pod redisbox
+kubectl delete -f redis-box.yaml
 ```
 
 ### Decrease the size of the cluster while it is not in use
diff --git a/deployment/gcp/redis-box.yaml b/deployment/gcp/redis-box.yaml
new file mode 100644
index 0000000..8c150af
--- /dev/null
+++ b/deployment/gcp/redis-box.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: redis-box
+  labels:
+    app: redis-box
+spec:
+  containers:
+    - name: redis-box
+      image: gcr.io/google_containers/redis:v1
diff --git a/deployment/load_site_list_into_redis.sh b/deployment/load_site_list_into_redis.sh
index 6d728d0..a818e27 100755
--- a/deployment/load_site_list_into_redis.sh
+++ b/deployment/load_site_list_into_redis.sh
@@ -1,6 +1,11 @@
 #!/usr/bin/env bash
 set -e
 
+if [[ "$REDIS_HOST" == "" ]]; then
+    echo "The env var \$REDIS_HOST needs to be set with the IP/hostname of the managed Redis instance" >&2
+    exit 1
+fi
+
 if [[ $# -lt 2 ]]; then
     echo "Usage: load_site_list_into_redis.sh redis_queue_name site_list_csv" >&2
     exit 1
@@ -19,7 +24,8 @@ echo "DEL $REDIS_QUEUE_NAME:processing" >> joblist.txt
 # awk #1 = Add the RPUSH command with the site value within single quotes
 cat "$SITE_LIST_CSV" | sed '1!G;h;$!d' | sed "s/'/\\\'/g" | awk -F ',' 'FNR > 0 {print "RPUSH '$REDIS_QUEUE_NAME' '\''"$1","$2"'\''"}' >> joblist.txt
 
-kubectl cp joblist.txt redis-master:/tmp/joblist.txt
-kubectl exec redis-master -- sh -c "cat /tmp/joblist.txt | redis-cli --pipe"
+kubectl apply -f redis-box.yaml
+kubectl cp joblist.txt redis-box:/tmp/joblist.txt
+kubectl exec redis-box -- sh -c "cat /tmp/joblist.txt | redis-cli -h $REDIS_HOST --pipe"
 
 rm joblist.txt

From 3aa54ccff7dd1a50d8c043df6bc7e85fbaeb57ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 13:58:00 +0300
Subject: [PATCH 05/10] Nit typo

---
 deployment/gcp/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deployment/gcp/README.md b/deployment/gcp/README.md
index f3b6adf..bbc8b1c 100644
--- a/deployment/gcp/README.md
+++ b/deployment/gcp/README.md
@@ -239,7 +239,7 @@ kubectl delete -f redis-box.yaml
 
 ### Decrease the size of the cluster while it is not in use
 
-While the cluster has autoscaling activated, and thus should scale down when not in use, it can sometimes be slow to do this or fail to do this adequately. In these instances, it is a good idea to go to `Clusters -> crawl1 -> default-pool -> Edit` and set the number of instances to 0 or 1 manually. It will still scale up when the next crawl is executed.
+While the cluster has auto-scaling activated, and thus should scale down when not in use, it can sometimes be slow to do this or fail to do this adequately. In these instances, it is a good idea to go to `Clusters -> crawl -> default-pool -> Edit` and set the number of instances to 0 or 1 manually. It will still scale up when the next crawl is executed.
 
 ### Deleting the GKE Cluster
 

From 79e142b0eacbb921ccb27da2e05a515a8a64804e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 14:06:06 +0300
Subject: [PATCH 06/10] Moved out deployment of redis-box from helper script

---
 deployment/gcp/README.md                | 12 ++++++++----
 deployment/load_site_list_into_redis.sh |  1 -
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/deployment/gcp/README.md b/deployment/gcp/README.md
index bbc8b1c..aa05a17 100644
--- a/deployment/gcp/README.md
+++ b/deployment/gcp/README.md
@@ -100,12 +100,17 @@ Remember to change the `crawl.yaml` to point to `image: gcr.io/$PROJECT/openwpm`
 
 ## Deploy the redis server which we use for the work queue
 
-This will launch a 1GB Basic tier Google Cloud Memorystore for Redis instance ($0.049/GB/hour):
+Launch a 1GB Basic tier Google Cloud Memorystore for Redis instance ($0.049/GB/hour):
 ```
 gcloud redis instances create crawlredis --size=1 --region=us-central1 --redis-version=redis_4_0
 ```
 
-Next, use the following output:
+Launch a temporary redis-box pod in the cluster, which we use to interact with the above Redis instance:
+```
+kubectl apply -f redis-box.yaml
+```
+
+Use the following output:
 ```
 gcloud redis instances describe crawlredis --region=us-central1
 ```
@@ -178,9 +183,8 @@ Note that for the remainder of these instructions, `metadata.name` is assumed to
 
 #### Queue status
 
-Open a temporary instance and launch redis-cli:
+Launch redis-cli:
 ```
-kubectl apply -f redis-box.yaml
 kubectl exec -it redis-box -- sh -c "redis-cli -h $REDIS_HOST"
 ```
 
diff --git a/deployment/load_site_list_into_redis.sh b/deployment/load_site_list_into_redis.sh
index a818e27..5489fb8 100755
--- a/deployment/load_site_list_into_redis.sh
+++ b/deployment/load_site_list_into_redis.sh
@@ -24,7 +24,6 @@ echo "DEL $REDIS_QUEUE_NAME:processing" >> joblist.txt
 # awk #1 = Add the RPUSH command with the site value within single quotes
 cat "$SITE_LIST_CSV" | sed '1!G;h;$!d' | sed "s/'/\\\'/g" | awk -F ',' 'FNR > 0 {print "RPUSH '$REDIS_QUEUE_NAME' '\''"$1","$2"'\''"}' >> joblist.txt
 
-kubectl apply -f redis-box.yaml
 kubectl cp joblist.txt redis-box:/tmp/joblist.txt
 kubectl exec redis-box -- sh -c "cat /tmp/joblist.txt | redis-cli -h $REDIS_HOST --pipe"
 

From 8cfd59d5bedcd0c30c7bc0fbac29ae8ad0c21fcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 14:21:57 +0300
Subject: [PATCH 07/10] Do not start the redis server in redis-box

---
 deployment/gcp/redis-box.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deployment/gcp/redis-box.yaml b/deployment/gcp/redis-box.yaml
index 8c150af..c222787 100644
--- a/deployment/gcp/redis-box.yaml
+++ b/deployment/gcp/redis-box.yaml
@@ -7,4 +7,7 @@ metadata:
 spec:
   containers:
     - name: redis-box
-      image: gcr.io/google_containers/redis:v1
+      image: redis:4
+      # avoids starting the redis-server
+      command: ["tail"]
+      args: ["-f", "/dev/null"]

From dac977c975b66fa487598473b9994c38191a91ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 17:14:37 +0300
Subject: [PATCH 08/10] Added commands and docs for resizing the cluster before
 and after the crawls

---
 deployment/gcp/README.md | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/deployment/gcp/README.md b/deployment/gcp/README.md
index aa05a17..3f655c2 100644
--- a/deployment/gcp/README.md
+++ b/deployment/gcp/README.md
@@ -169,6 +169,16 @@ Use of `envsubst` has already replaced `$REDIS_HOST` with the value of the env v
 
 Note: A useful naming convention for `CRAWL_DIRECTORY` is `YYYY-MM-DD_description_of_the_crawl`.
 
+### Scale up the cluster before running the crawl
+
+Some nodes, including the master node, can become temporarily unavailable during cluster auto-scaling operations. When larger new crawls are started, this can cause disruptions for a couple of minutes after the crawl has started.
+
+To avoid this, set the number of nodes (to, say, 15) before starting the crawl:
+
+```
+gcloud container clusters resize crawl1 --num-nodes=15
+```
+
 ## Start the crawl
 
 When you are ready, deploy the crawl:
@@ -243,7 +253,13 @@ kubectl delete -f redis-box.yaml
 
 ### Decrease the size of the cluster while it is not in use
 
-While the cluster has auto-scaling activated, and thus should scale down when not in use, it can sometimes be slow to do this or fail to do this adequately. In these instances, it is a good idea to go to `Clusters -> crawl -> default-pool -> Edit` and set the number of instances to 0 or 1 manually. It will still scale up when the next crawl is executed.
+While the cluster has auto-scaling activated, and thus should scale down when not in use, it can sometimes be slow to do this or fail to do this adequately. In these instances, it is a good idea to set the number of nodes to 0 or 1 manually:
+
+```
+gcloud container clusters resize crawl1 --num-nodes=1
+```
+
+It will still auto-scale up when the next crawl is executed.
 
 ### Deleting the GKE Cluster
 

From 6270a7d82593ce51c9d5722dbc261b8f1174e825 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 17:16:59 +0300
Subject: [PATCH 09/10] Raised backoffLimit from default 6 to 10000 to avoid
 large crawls failing due to sporadic worker crashes

---
 deployment/gcp/crawl.tmpl.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deployment/gcp/crawl.tmpl.yaml b/deployment/gcp/crawl.tmpl.yaml
index abb43ae..6224577 100644
--- a/deployment/gcp/crawl.tmpl.yaml
+++ b/deployment/gcp/crawl.tmpl.yaml
@@ -5,6 +5,7 @@ metadata:
 spec:
   # adjust for parallelism
   parallelism: 100
+  backoffLimit: 10000 # to avoid crawls failing due to sporadic worker crashes
   template:
     metadata:
       name: openwpm-crawl

From dd5a2f0449130d82c3bc87cd1b34ff66107935f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fredrik=20Wollse=CC=81n?= <fredrik@neam.se>
Date: Thu, 1 Aug 2019 17:18:48 +0300
Subject: [PATCH 10/10] Added hint in crawl config template regarding resource
 allocation settings

---
 deployment/gcp/crawl.tmpl.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deployment/gcp/crawl.tmpl.yaml b/deployment/gcp/crawl.tmpl.yaml
index 6224577..61586cf 100644
--- a/deployment/gcp/crawl.tmpl.yaml
+++ b/deployment/gcp/crawl.tmpl.yaml
@@ -59,6 +59,8 @@ spec:
           # these are taken at face value by the autoscaler, so they should match actual
           # resources required by any single instance/container as good as possible
           # see: https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler
+          # tip: observe `kubectl top nodes` during auto-scaled crawls to get an idea of how
+          # resources are being utilized
           requests:
             cpu: 750m
           limits: