From 7c2179b9d370eed1986fa68da78d80c5479b7280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Liqin=20Zhang=F0=9F=98=88?= Date: Thu, 19 Dec 2024 21:31:01 +0000 Subject: [PATCH 1/2] Integrating Shipshape with ai-on-gke: Helm Scan Integration This PR initiates the integration of Shipshape security scans into the ai-on-gke repository, starting with Phase 1: Helm Scan Onboarding. Purpose: This PR introduces a Cloud Build workflow to automatically perform Helm scans on the ai-on-gke repository using the Shipshape validation service. This initial integration focuses on scanning Helm charts for the iap and kuberay-tpu-webhook components. These components were selected because they are owned and fully controlled by the ai-on-gke team and do not require a cluster scan. Implementation: A new Cloud Build configuration file violation_scan_helm.yaml is added to trigger Helm scans on pull requests. The workflow utilizes a Docker image from the validation-service-agent repository to execute the scans. The scan targets all Helm charts within the repository. Initial allowlist files are included to manage accepted policy exceptions. The build will fail if any violations are found outside the allowlist. Next Steps: Phase 2: Identify and onboard focus components based on ownership established in collaboration with the ai-eco team. Phase 3: Implement a component ignore feature and transition to a secure-by-default model for all remaining components. Continuously update the allowlist in collaboration with the ai-eco team and establish a prioritized remediation strategy. Track and report on success metrics, including the number of violations found, PRs blocked, and violations fixed. Related Issues: b/377714818 b/378933059 b/382726583 Future Considerations: Integrate cluster scans for comprehensive security analysis. Synchronize violations with the Shipshape dashboard for improved visualization and tracking. 
Set up office hours with GKE Security experts for consultation and guidance on addressing violations. This PR marks a significant step towards enhancing the security and compliance of the ai-on-gke project by proactively identifying and addressing potential vulnerabilities in Kubernetes configurations. --- .../category/helm/iap/defaultnamespace.json | 21 +++++++++ .../allowprivilegeescalation.json | 17 +++++++ .../kuberay-tpu-webhook/capabilities.json | 17 +++++++ .../helm/kuberay-tpu-webhook/imagedigest.json | 18 ++++++++ .../kuberay-tpu-webhook/imagefreshness.json | 18 ++++++++ .../helm/kuberay-tpu-webhook/imagepath.json | 18 ++++++++ .../kuberay-tpu-webhook/readonlyrootfs.json | 17 +++++++ .../helm/kuberay-tpu-webhook/rootless.json | 17 +++++++ .../kuberay-tpu-webhook/seccompprofile.json | 13 ++++++ violation_scan_helm.yaml | 45 +++++++++++++++++++ 10 files changed, 201 insertions(+) create mode 100644 security_test/allowlist/category/helm/iap/defaultnamespace.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/allowprivilegeescalation.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/capabilities.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/imagedigest.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/imagefreshness.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/imagepath.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/readonlyrootfs.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/rootless.json create mode 100644 security_test/allowlist/category/helm/kuberay-tpu-webhook/seccompprofile.json create mode 100644 violation_scan_helm.yaml diff --git a/security_test/allowlist/category/helm/iap/defaultnamespace.json b/security_test/allowlist/category/helm/iap/defaultnamespace.json new file mode 100644 index 000000000..c3a3f175f --- 
/dev/null +++ b/security_test/allowlist/category/helm/iap/defaultnamespace.json @@ -0,0 +1,21 @@ +[ + { + "message": "Ingress \"iap-ingress\" is in the default namespace, which is not allowed.", + "policyName": "defaultnamespace", + "resourceKey": { + "group": "networking.k8s.io", + "kind": "Ingress", + "name": "iap-ingress", + "version": "v1" + } + }, + { + "message": "Secret \"iap-secret\" is in the default namespace, which is not allowed.", + "policyName": "defaultnamespace", + "resourceKey": { + "kind": "Secret", + "name": "iap-secret", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/allowprivilegeescalation.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/allowprivilegeescalation.json new file mode 100644 index 000000000..e958173f3 --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/allowprivilegeescalation.json @@ -0,0 +1,17 @@ +[ + { + "details": { + "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", + "containerName": "kuberay-tpu-webhook" + }, + "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not set allowPrivilegeEscalation: false in its securityContext. 
See go/gke-shipshape#allowprivilegeescalation for more details", + "policyName": "allowprivilegeescalation", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/capabilities.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/capabilities.json new file mode 100644 index 000000000..ee24e2476 --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/capabilities.json @@ -0,0 +1,17 @@ +[ + { + "details": { + "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", + "containerName": "kuberay-tpu-webhook" + }, + "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details", + "policyName": "capabilities", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagedigest.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagedigest.json new file mode 100644 index 000000000..0ac490949 --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagedigest.json @@ -0,0 +1,18 @@ +[ + { + "details": { + "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", + "containerName": "kuberay-tpu-webhook", + "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0" + }, + "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e", + "policyName": 
"imagedigest", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagefreshness.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagefreshness.json new file mode 100644 index 000000000..12c541ee7 --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagefreshness.json @@ -0,0 +1,18 @@ +[ + { + "details": { + "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", + "containerName": "kuberay-tpu-webhook", + "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0" + }, + "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" that does not have a valid digest.", + "policyName": "imagefreshness", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagepath.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagepath.json new file mode 100644 index 000000000..ff7f4632e --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagepath.json @@ -0,0 +1,18 @@ +[ + { + "details": { + "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", + "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0", + "containerName": "kuberay-tpu-webhook" + }, + "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" with an invalid path. 
See go/gke-shipshape#imagepath for valid image paths.", + "policyName": "imagepath", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/readonlyrootfs.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/readonlyrootfs.json new file mode 100644 index 000000000..ac1c33537 --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/readonlyrootfs.json @@ -0,0 +1,17 @@ +[ + { + "details": { + "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", + "containerName": "kuberay-tpu-webhook" + }, + "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.", + "policyName": "readonlyrootfs", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/rootless.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/rootless.json new file mode 100644 index 000000000..6ce918ef7 --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/rootless.json @@ -0,0 +1,17 @@ +[ + { + "details": { + "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", + "containerName": "kuberay-tpu-webhook" + }, + "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" is running as root. Update the container to run as non-root. 
See go/gke-shipshape#rootless for more details", + "policyName": "rootless", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/security_test/allowlist/category/helm/kuberay-tpu-webhook/seccompprofile.json b/security_test/allowlist/category/helm/kuberay-tpu-webhook/seccompprofile.json new file mode 100644 index 000000000..4fa68ac67 --- /dev/null +++ b/security_test/allowlist/category/helm/kuberay-tpu-webhook/seccompprofile.json @@ -0,0 +1,13 @@ +[ + { + "message": "pod in Deployment \"kuberay-tpu-webhook\" must set securityContext.seccompProfile.type to value RuntimeDefault", + "policyName": "seccompprofile", + "resourceKey": { + "group": "apps", + "kind": "Deployment", + "name": "kuberay-tpu-webhook", + "namespace": "ray-system", + "version": "v1" + } + } +] diff --git a/violation_scan_helm.yaml b/violation_scan_helm.yaml new file mode 100644 index 000000000..82c7d0724 --- /dev/null +++ b/violation_scan_helm.yaml @@ -0,0 +1,45 @@ +substitutions: + _IMAGE: us-docker.pkg.dev/zlq-gke-dev/validation-service-agent/agent@sha256:7e1f0b84e9713367cb197182fe3834fde416448a64b94337ba0ea7154e4cc519 + +steps: +- name: 'ubuntu' + id: 'Copy Folder Locally' + entrypoint: 'bash' + args: + - '-c' + - | + mkdir -p security_test/scan_target/ && find . -mindepth 1 -maxdepth 1 -type d ! -name "security_test" -exec cp -r {} security_test/scan_target/ \; + +- name: 'ubuntu' + id: 'Copy metadata' + entrypoint: 'bash' + args: + - '-c' + - | + mkdir -p /workspace/security_test/scan_target + # Exclude /workspace/security_test from the copy to avoid recursive issue + find . -mindepth 1 -maxdepth 1 ! 
-path "./security_test" -exec cp -r {} /workspace/security_test/scan_target/ \; + chown -R 65532:65532 /workspace/security_test/scan_target + +- name: 'gcr.io/cloud-builders/docker' + id: 'Run Docker Image' + args: + - 'run' + - '--network=cloudbuild' + - '--rm' + - '-v' + - '/workspace/security_test/allowlist:/workspace/security_test/allowlist' + - '-v' + - '/workspace/security_test/scan_target:/workspace/security_test/scan_target' + - '${_IMAGE}' + - '--mode=helm' + - '--allowlist_folder=/workspace/security_test/allowlist' + - '--scan_path=/workspace/security_test/scan_target' + - '--max_wait_duration=60' + + +# Fail the build if there are any violations +timeout: '12000s' + +options: + logging: CLOUD_LOGGING_ONLY From 3066168dd2f42a9b3e685d479a834c248851d906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Liqin=20Zhang=F0=9F=98=88?= Date: Thu, 19 Dec 2024 21:36:16 +0000 Subject: [PATCH 2/2] Change to official repo --- violation_scan_helm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/violation_scan_helm.yaml b/violation_scan_helm.yaml index 82c7d0724..ef7e746ab 100644 --- a/violation_scan_helm.yaml +++ b/violation_scan_helm.yaml @@ -1,5 +1,5 @@ substitutions: - _IMAGE: us-docker.pkg.dev/zlq-gke-dev/validation-service-agent/agent@sha256:7e1f0b84e9713367cb197182fe3834fde416448a64b94337ba0ea7154e4cc519 + _IMAGE: us-docker.pkg.dev/k8ssecurityvalidation-agent/k8ssecurityvalidation-agent/k8ssecurityvalidation-agent@sha256:7eaedb4153841b814e6b5367e63214318cb3318f902427b9214474e1256d0b37 steps: - name: 'ubuntu'