diff --git a/.github/workflows/tilt.yml b/.github/workflows/tilt.yml
new file mode 100644
index 000000000..60ff132b6
--- /dev/null
+++ b/.github/workflows/tilt.yml
@@ -0,0 +1,104 @@
+name: "Tilt CI"
+on:
+  pull_request:
+    branches: [ main ]
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Login to GCR
+        uses: docker/login-action@v3
+        with:
+          registry: gcr.io
+          username: _json_key
+          password: ${{ secrets.GCR_JSON_KEY }}
+      - uses: actions/checkout@v3
+      - uses: cachix/install-nix-action@v22
+        with:
+          github_access_token: ${{ secrets.GITHUB_TOKEN }}
+      - id: "gcp-auth"
+        uses: "google-github-actions/auth@v2"
+        with:
+          credentials_json: "${{ secrets.GOOGLE_CREDENTIALS }}"
+      # Debug aid: sample container resource usage and state in the background for the rest of the job
+      - name: Setup container monitoring
+        run: |
+          mkdir -p ./logs
+          # Start monitoring in the background and log the process ID
+          (while true; do
+            echo "$(date '+%Y-%m-%d %H:%M:%S')" >> ./logs/container_memory.log
+            docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}" >> ./logs/container_memory.log
+            echo "---" >> ./logs/container_memory.log
+            sleep 1
+          done) &
+          MONITOR_PID=$!
+          echo $MONITOR_PID > monitor.pid
+
+          # Add container status monitoring
+          (while true; do
+            echo "=== Container Status at $(date '+%Y-%m-%d %H:%M:%S.%N') ===" >> ./logs/container_status.log
+            docker ps -a --format "{{.Names}}\t{{.Status}}\t{{.State}}" >> ./logs/container_status.log
+            sleep 0.1
+          done) &
+          STATUS_PID=$!
+          echo $STATUS_PID > status.pid
+
+          # Wait for monitoring to start and verify it's running
+          sleep 5
+          if ! kill -0 $MONITOR_PID 2>/dev/null || ! kill -0 $STATUS_PID 2>/dev/null; then
+            echo "Monitoring failed to start"
+            exit 1
+          fi
+
+      - name: Reset Deps
+        run: |
+          # Verify monitoring is still running
+          if ! kill -0 $(cat monitor.pid) 2>/dev/null; then
+            echo "Monitoring stopped unexpectedly"
+            exit 1
+          fi
+          nix develop -c make reset-deps
+      - name: Tilt CI
+        run: nix develop -c make tilt-in-ci
+        env:
+          TF_VAR_sa_creds: ${{ secrets.GOOGLE_SA_BASE64 }}
+      # Best-effort diagnostics, collected only when an earlier step failed
+      - name: Collect failure information
+        if: failure()
+        run: |
+          # ./logs is missing if the job failed before the monitoring step ran
+          mkdir -p ./logs
+          echo "=== Docker Container Status ===" > ./logs/failure_info.log
+          docker ps -a >> ./logs/failure_info.log 2>&1 || true
+          # Headers use printf: bash's builtin echo would print "\n" literally
+          printf '\n=== Failed Container Logs ===\n' >> ./logs/failure_info.log
+          docker ps -a --filter "status=exited" --format '{{.Names}}' | while read -r container; do
+            echo "=== $container ===" >> ./logs/failure_info.log
+            docker logs "$container" >> ./logs/failure_info.log 2>&1 || true
+          done
+          # "|| true": a missing cala container must not abort the remaining diagnostics
+          printf '\n=== Cala Container Inspect ===\n' >> ./logs/failure_info.log
+          docker inspect lana-bank-cala-1 >> ./logs/failure_info.log 2>&1 || true
+          printf '\n=== Cala Container Logs ===\n' >> ./logs/failure_info.log
+          # tee keeps the logs in the job output as well as in the uploaded artifact
+          docker logs lana-bank-cala-1 2>&1 | tee -a ./logs/failure_info.log || true
+      # Stop both background samplers, even after a failure, before the logs are uploaded
+      - name: Stop monitoring
+        if: always()
+        run: |
+          if [ -f monitor.pid ]; then
+            kill $(cat monitor.pid) || echo "Monitoring process already stopped."
+            rm -f monitor.pid
+          fi
+          if [ -f status.pid ]; then
+            kill $(cat status.pid) || echo "Status monitoring process already stopped."
+            rm -f status.pid
+          fi
+          sleep 5  # Allow log flush
+
+      - name: Upload memory logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: debug-logs
+          path: ./logs/*.log
diff --git a/Makefile b/Makefile
index 48cde8928..8d2f1f1a0 100644
--- a/Makefile
+++ b/Makefile
@@ -48,7 +48,7 @@ init-bq: delete-bq-tables reset-tf-state clean-deps start-deps setup-db
 reset-deps: reset-tf-state clean-deps start-deps setup-db run-tf
 
 run-server:
-	cargo run --bin lana-cli --features sim-time -- --config ./bats/lana-sim-time.yml | tee .e2e-logs
+	cargo run -j 4 --bin lana-cli --features sim-time -- --config ./bats/lana-sim-time.yml | tee .e2e-logs
 
 check-code: sdl
 	git diff --exit-code lana/admin-server/src/graphql/schema.graphql
diff --git a/dev/Tiltfile b/dev/Tiltfile
index 12445f9ea..65344b7f7 100644
--- a/dev/Tiltfile
+++ b/dev/Tiltfile
@@ -13,65 +13,65 @@ for service, deps in docker_groups.items():
     for dep in deps:
         dc_resource(dep, labels=[service])
 
-local_resource(
-    "core",
-    labels=["core"],
-    serve_cmd="cd .. && make run-server",
-    serve_env={
-        "PG_CON": "postgres://user:password@localhost:5433/pg",
-        "OTEL_EXPORTER_OTLP_ENDPOINT": "http://localhost:4317",
-    },
-    readiness_probe=probe(
-        period_secs=5,
-        http_get=http_get_action(
-            path="/admin/graphql",
-            port=4455,
-        ),
-    ),
-    allow_parallel=True,
-    resource_deps=[
-        "cala",
-        "core-pg",
-        "cala-pg",
-    ],
-    links = [
-        link("http://localhost:4455/admin/graphql", "playground"),
-    ]
-)
+#local_resource(
+#    "core",
+#    labels=["core"],
+#    serve_cmd="cd .. && make run-server",
+#    serve_env={
+#        "PG_CON": "postgres://user:password@localhost:5433/pg",
+#        "OTEL_EXPORTER_OTLP_ENDPOINT": "http://localhost:4317",
+#    },
+#    readiness_probe=probe(
+#        period_secs=5,
+#        http_get=http_get_action(
+#            path="/admin/graphql",
+#            port=4455,
+#        ),
+#    ),
+#    allow_parallel=True,
+#    resource_deps=[
+#        "cala",
+#        "core-pg",
+#        "cala-pg",
+#    ],
+#    links = [
+#        link("http://localhost:4455/admin/graphql", "playground"),
+#    ]
+#)
 
-local_resource(
-    "admin-panel",
-    labels=["apps"],
-    serve_env={
-        "NEXT_PUBLIC_BASE_PATH": "/admin-panel"
-    },
-    serve_cmd="cd ../apps/admin-panel && pnpm install && pnpm run dev",
-    readiness_probe = probe(
-        period_secs = 5,
-        http_get = http_get_action(
-            path = "/admin-panel",
-            port = 4455,
-        ),
-    ),
-    allow_parallel=True,
-    resource_deps=[
-        "next-auth-pg",
-        "mailhog",
-        "core",
-    ],
-    links = [
-        link("http://localhost:4455/admin-panel", "admin-panel"),
-    ]
-)
-
-
-if is_ci:
-    local_resource(
-        name="cypress",
-        labels=["apps-ci-test"],
-        cmd="cd ../apps/admin-panel && pnpm cypress:run headless",
-        allow_parallel=True,
-        resource_deps=[
-            "admin-panel"
-        ],
-    )
+#local_resource(
+#    "admin-panel",
+#    labels=["apps"],
+#    serve_env={
+#        "NEXT_PUBLIC_BASE_PATH": "/admin-panel"
+#    },
+#    serve_cmd="cd ../apps/admin-panel && pnpm install && pnpm run dev",
+#    readiness_probe = probe(
+#        period_secs = 5,
+#        http_get = http_get_action(
+#            path = "/admin-panel",
+#            port = 4455,
+#        ),
+#    ),
+#    allow_parallel=True,
+#    resource_deps=[
+#        "next-auth-pg",
+#        "mailhog",
+#        "core",
+#    ],
+#    links = [
+#        link("http://localhost:4455/admin-panel", "admin-panel"),
+#    ]
+#)
+#
+#
+#if is_ci:
+#    local_resource(
+#        name="cypress",
+#        labels=["apps-ci-test"],
+#        cmd="cd ../apps/admin-panel && pnpm cypress:run headless",
+#        allow_parallel=True,
+#        resource_deps=[
+#            "admin-panel"
+#        ],
+#    )
diff --git a/dev/bin/tilt-ci.sh b/dev/bin/tilt-ci.sh
index 896ef7220..86c2d2d9e 100755
--- a/dev/bin/tilt-ci.sh
+++ b/dev/bin/tilt-ci.sh
@@ -1,17 +1,44 @@
 #!/bin/bash
+set -eo pipefail  # Exit on error, pipe failure
 
 REPO_ROOT=$(git rev-parse --show-toplevel)
+LOGS_DIR="${REPO_ROOT}/logs"
+mkdir -p "${LOGS_DIR}"
+# Source CI environment if exists
 [ -f tmp.env.ci ] && source tmp.env.ci || true
 
 cd "${REPO_ROOT}"
 
-tilt ci --file dev/Tiltfile | tee tilt.log | grep cypress
+
+# Run Tilt with full logging
+echo "Starting Tilt CI at $(date)" | tee -a "${LOGS_DIR}/tilt-full.log"
+# errexit is suspended around the pipeline: with "set -eo pipefail" a failing
+# tilt (or a grep with no match) would abort the script right here, before the
+# status is captured and the failure diagnostics below are collected.
+set +e
+tilt ci --file dev/Tiltfile 2>&1 | tee -a "${LOGS_DIR}/tilt-full.log" | tee >(grep cypress > "${LOGS_DIR}/cypress.log") | grep cypress
 status=${PIPESTATUS[0]}
+set -e
 
-if [[ $status -eq 0 ]]; then
-  echo "Tilt CI passed"
+# Collect additional information on failure
+if [[ $status -ne 0 ]]; then
+  echo "Tilt CI failed with status $status at $(date)" | tee -a "${LOGS_DIR}/tilt-full.log"
+  echo "=== Tilt Logs ===" | tee -a "${LOGS_DIR}/failure.log"
+  tail -n 100 "${LOGS_DIR}/tilt-full.log" >> "${LOGS_DIR}/failure.log"
+
+  # Docker diagnostics are best-effort ("|| true") so the script still exits with tilt's status
+  echo "=== Container Status ===" | tee -a "${LOGS_DIR}/failure.log"
+  docker ps -a >> "${LOGS_DIR}/failure.log" 2>&1 || true
+
+  echo "=== Failed Container Logs ===" | tee -a "${LOGS_DIR}/failure.log"
+  docker ps -a --filter "status=exited" --format '{{.Names}}' | while read -r container; do
+    echo "=== $container ===" >> "${LOGS_DIR}/failure.log"
+    docker logs "$container" &>> "${LOGS_DIR}/failure.log" || true
+  done || true
+
+  cat "${LOGS_DIR}/failure.log"
 else
-  cat tilt.log
+  echo "Tilt CI passed at $(date)" | tee -a "${LOGS_DIR}/tilt-full.log"
 fi
 
 exit "$status"
diff --git a/docker-compose.yml b/docker-compose.yml
index 9958faa37..b45a7ab5a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,8 +12,10 @@ services:
       interval: 1s
       timeout: 1s
       retries: 20
+    mem_limit: 1g
   cala-pg:
     image: postgres:16.4
+    mem_limit: 512m
     command: -c 'max_connections=200'
     ports:
       - "5432:5432"
@@ -32,6 +34,7 @@ services:
       - "2252:2252"
     volumes:
       - ./dev/cala.yml:/cala.yml
+    mem_limit: 512m
     environment:
       - PG_CON=postgresql://user:password@cala-pg:5432/pg
       - CALA_CONFIG=/cala.yml
@@ -41,6 +44,7 @@ services:
       - cala-pg
       - otel-agent
   otel-agent:
+    mem_limit: 256m
     ports:
       - "4317:4317" # OpenTelemetry receiver
     image: otel/opentelemetry-collector-contrib:0.57.2
@@ -52,6 +56,7 @@ services:
       - ./dev/otel-agent-config.yaml:/etc/otel-agent-config.yaml
   kratos:
     image: oryd/kratos:v1.2.0
+    mem_limit: 512m
     extra_hosts:
       - "dockerhost-alias:host-gateway"
     ports:
@@ -68,6 +73,7 @@ services:
       - ./dev/ory:/home/ory
   kratos-pg:
     image: postgres:16.4
+    mem_limit: 512m
     ports:
       - "5434:5432"
     environment:
@@ -76,6 +82,7 @@ services:
       - POSTGRES_DB=default
   next-auth-pg:
     image: postgres:16.4
+    mem_limit: 512m
     ports:
       - "5435:5432"
     environment:
@@ -84,6 +91,7 @@ services:
       - POSTGRES_DB=default
   oathkeeper:
     image: oryd/oathkeeper:v0.40.7-distroless
+    mem_limit: 512m
     extra_hosts:
       - "dockerhost-alias:host-gateway"
     ports:
@@ -96,6 +104,7 @@ services:
       - kratos
       - otel-agent
   mailhog:
+    mem_limit: 512m
     image: mailhog/mailhog:latest
     ports:
       - "1025:1025"