From 49f5b766223b9d6bcf348ad2c7dbcae77abbe76b Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Thu, 12 Oct 2023 14:57:51 -0400 Subject: [PATCH 1/4] Add CI and AWS terraform Why these changes are being introduced: This final PR adds Github CI actions and terraform code to project. This was formerly in place, but removed during the full PR review process. How this addresses that need: * adds .github folder * adds terraform generated AWS CLI commands to Makefile * adds convenience Makefile command to run a test harvest in Dev1 Side effects of this change: * Github actions now fire Relevant ticket(s): https://mitlibraries.atlassian.net/browse/TIMX-247 --- .github/dependabot.yml | 18 ++++++++++++++ .github/pull-request-template.md | 39 ++++++++++++++++++++++++++++++ .github/workflows/ci.yml | 7 ++++++ .github/workflows/dev-build.yml | 24 ++++++++++++++++++ .github/workflows/prod-promote.yml | 21 ++++++++++++++++ .github/workflows/stage-build.yml | 24 ++++++++++++++++++ Makefile | 34 +++++++++++++++++++++++++- README.md | 19 +++++++-------- bin/test-harvest-ecs-dev1.sh | 13 ++++++++++ 9 files changed, 188 insertions(+), 11 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .github/pull-request-template.md create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/dev-build.yml create mode 100644 .github/workflows/prod-promote.yml create mode 100644 .github/workflows/stage-build.yml create mode 100755 bin/test-harvest-ecs-dev1.sh diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..0e71ac3 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,18 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
+# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" + + # Maintain dependencies for npm + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/pull-request-template.md b/.github/pull-request-template.md new file mode 100644 index 0000000..fc70031 --- /dev/null +++ b/.github/pull-request-template.md @@ -0,0 +1,39 @@ +### What does this PR do? + +Describe the overall purpose of the PR changes. Doesn't need to be as specific as the +individual commits. + +### Helpful background context + +Describe any additional context beyond what the PR accomplishes if it is likely to be +useful to a reviewer. + +Delete this section if it isn't applicable to the PR. + +### How can a reviewer manually see the effects of these changes? + +Explain how to see the proposed changes in the application if possible. + +Delete this section if it isn't applicable to the PR. + +### Includes new or updated dependencies? + +YES | NO + +### What are the relevant tickets? + +Include links to Jira Software and/or Jira Service Management tickets here. 
+ +### Developer + +- [ ] All new ENV is documented in README (or there is none) +- [ ] Stakeholder approval has been confirmed (or is not needed) + +### Code Reviewer + +- [ ] The commit message is clear and follows our guidelines + (not just this pull request message) +- [ ] There are appropriate tests covering any new functionality +- [ ] The documentation has been updated or is unnecessary +- [ ] The changes have been verified +- [ ] New dependencies are appropriate or there were no changes diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..e4937dd --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,7 @@ +name: CI +on: push +jobs: + test: + uses: mitlibraries/.github/.github/workflows/python-shared-test.yml@main + lint: + uses: mitlibraries/.github/.github/workflows/python-shared-lint.yml@main diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml new file mode 100644 index 0000000..d059288 --- /dev/null +++ b/.github/workflows/dev-build.yml @@ -0,0 +1,24 @@ +### This is the Terraform-generated dev-build.yml workflow for the browsertrix-harvester-dev app repository ### +### If this is a Lambda repo, uncomment the FUNCTION line at the end of the document ### +### If the container requires any additional pre-build commands, uncomment and edit ### +### the PREBUILD line at the end of the document. 
### +name: Dev Container Build and Deploy +on: + workflow_dispatch: + pull_request: + branches: + - main + paths-ignore: + - '.github/**' + +jobs: + deploy: + name: Dev Container Deploy + uses: mitlibraries/.github/.github/workflows/ecr-shared-deploy-dev.yml@main + secrets: inherit + with: + AWS_REGION: "us-east-1" + GHA_ROLE: "browsertrix-harvester-gha-dev" + ECR: "browsertrix-harvester-dev" + # FUNCTION: "" + # PREBUILD: \ No newline at end of file diff --git a/.github/workflows/prod-promote.yml b/.github/workflows/prod-promote.yml new file mode 100644 index 0000000..143e685 --- /dev/null +++ b/.github/workflows/prod-promote.yml @@ -0,0 +1,21 @@ +### This is the Terraform-generated prod-promote.yml workflow for the browsertrix-harvester-prod repository. ### +### If this is a Lambda repo, uncomment the FUNCTION line at the end of the document. ### +name: Prod Container Promote +on: + workflow_dispatch: + release: + types: [published] + +jobs: + deploy: + name: Prod Container Promote + uses: mitlibraries/.github/.github/workflows/ecr-shared-promote-prod.yml@main + secrets: inherit + with: + AWS_REGION: "us-east-1" + GHA_ROLE_STAGE: browsertrix-harvester-gha-stage + GHA_ROLE_PROD: browsertrix-harvester-gha-prod + ECR_STAGE: "browsertrix-harvester-stage" + ECR_PROD: "browsertrix-harvester-prod" + # FUNCTION: "" + \ No newline at end of file diff --git a/.github/workflows/stage-build.yml b/.github/workflows/stage-build.yml new file mode 100644 index 0000000..6785609 --- /dev/null +++ b/.github/workflows/stage-build.yml @@ -0,0 +1,24 @@ +### This is the Terraform-generated stage-build.yml workflow for the browsertrix-harvester-stage app repository ### +### If this is a Lambda repo, uncomment the FUNCTION line at the end of the document ### +### If the container requires any additional pre-build commands, uncomment and edit ### +### the PREBUILD line at the end of the document. 
### +name: Stage Container Build and Deploy +on: + workflow_dispatch: + push: + branches: + - main + paths-ignore: + - '.github/**' + +jobs: + deploy: + name: Stage Container Deploy + uses: mitlibraries/.github/.github/workflows/ecr-shared-deploy-stage.yml@main + secrets: inherit + with: + AWS_REGION: "us-east-1" + GHA_ROLE: "browsertrix-harvester-gha-stage" + ECR: "browsertrix-harvester-stage" + # FUNCTION: "" + # PREBUILD: diff --git a/Makefile b/Makefile index 8e64981..92da874 100644 --- a/Makefile +++ b/Makefile @@ -69,4 +69,36 @@ test-harvest-local: test-parse-url-content: pipenv run harvester parse-url-content \ --wacz-input-file="tests/fixtures/example.wacz" \ - --url="https://example.com/hello-world" \ No newline at end of file + --url="https://example.com/hello-world" + +# remote ecs task crawl in Dev1 +test-harvest-ecs-dev1: + bin/test-harvest-ecs-dev1.sh + +### Terraform-generated Developer Deploy Commands for Dev environment ### +dist-dev: ## Build docker container (intended for developer-based manual build) + docker build --platform linux/amd64 \ + -t $(ECR_URL_DEV):latest \ + -t $(ECR_URL_DEV):`git describe --always` \ + -t $(ECR_NAME_DEV):latest . + +publish-dev: dist-dev ## Build, tag and push (intended for developer-based manual publish) + aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin $(ECR_URL_DEV) + docker push $(ECR_URL_DEV):latest + docker push $(ECR_URL_DEV):`git describe --always` + +### Terraform-generated manual shortcuts for deploying to Stage. This requires ### +### that ECR_NAME_STAGE, ECR_URL_STAGE, and FUNCTION_STAGE environment ### +### variables are set locally by the developer and that the developer has ### +### authenticated to the correct AWS Account. The values for the environment ### +### variables can be found in the stage-build.yml caller workflow. 
### +dist-stage: ## Only use in an emergency + docker build --platform linux/amd64 \ + -t $(ECR_URL_STAGE):latest \ + -t $(ECR_URL_STAGE):`git describe --always` \ + -t $(ECR_NAME_STAGE):latest . + +publish-stage: ## Only use in an emergency + aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin $(ECR_URL_STAGE) + docker push $(ECR_URL_STAGE):latest + docker push $(ECR_URL_STAGE):`git describe --always` \ No newline at end of file diff --git a/README.md b/README.md index 0c9b64f..d471be4 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,15 @@ This Make command kicks off a harvest via a local docker container. The Make co The argument `--metadata-output-file="/crawls/collections/homepage/homepage.xml"` instructs the harvest to parse metadata records from the crawl, which are written to the container, and should then be available on the _host_ machine at: `output/crawls/collections/homepage/homepage.xml`. +### Remote Test Crawl + +```shell +make test-harvest-ecs-dev1 +``` + * Set AWS credentials are required in calling context + * Kicks off an ECS Fargate task in Dev1 + * WACZ file and metadata file are written to S3 at `timdex-extract-dev-222053980223/librarywebsite/test-harvest-ecs-$CURRENT_DATE.xml|wacz` + ## CLI commands ### Main @@ -261,16 +270,6 @@ An example record from an XML output file looks like this: ``` -## Convenience Make Commands - -### Local Test Crawl - -```shell -make test-harvest-local -``` - * Performs a crawl using the container mounted config YAML `/browsertrix-harvest/tests/fixtures/lib-website-homepage.yaml` - * Metadata is written to container directory `/crawls/collections/homepage/homepage.xml`, which is mounted and available in the local `output/` folder - ## Troubleshooting ### Cannot read/write from S3 for a LOCAL docker container harvest diff --git a/bin/test-harvest-ecs-dev1.sh b/bin/test-harvest-ecs-dev1.sh new file mode 100755 index 0000000..3542415 --- /dev/null +++ b/bin/test-harvest-ecs-dev1.sh @@ -0,0 +1,13 
@@ +# extract date +export CURRENT_DATE=$(date +"%Y-%m-%d") +export CRAWL_NAME=test-harvest-ecs-$CURRENT_DATE +echo "Invoking test crawl as ECS task in Dev1 named: $CRAWL_NAME" + +# invoke ECS task +aws ecs run-task \ +--cluster timdex-dev \ +--task-definition timdex-browsertrixharvester-dev \ +--launch-type="FARGATE" \ +--region us-east-1 \ +--network-configuration '{"awsvpcConfiguration": {"subnets": ["subnet-0488e4996ddc8365b","subnet-022e9ea19f5f93e65"], "securityGroups": ["sg-044033bf5f102c544"]}}' \ +--overrides '{"containerOverrides": [ {"name":"browsertrix-harvester", "command": ["--verbose", "harvest", "--crawl-name", "'"$CRAWL_NAME"'", "--config-yaml-file", "/browsertrix-harvester/tests/fixtures/lib-website-homepage.yaml", "--metadata-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.xml", "--wacz-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.wacz", "--num-workers", "2"]}]}' \ No newline at end of file From 880ca35e078d5c51e63c660fe2cb4eecd9918c79 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 13 Oct 2023 13:54:53 -0400 Subject: [PATCH 2/4] update dependabot config --- .github/dependabot.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 0e71ac3..fbd38b7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -11,8 +11,18 @@ updates: schedule: interval: "daily" - # Maintain dependencies for npm + # Maintain dependencies for application - package-ecosystem: "pip" directory: "/" schedule: - interval: "daily" + interval: "weekly" + reviewers: + - "MITLibraries/dataeng" + + # Maintain dependencies for Docker + - package-ecosystem: "docker" + directory: "/" + schedule: + interval: "weekly" + reviewers: + - "MITLibraries/dataeng" \ No newline at end of file From 5f9002137ef8c764c04a15e20e36684aa8eb13e7 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 13 Oct 2023 14:02:19 -0400 
Subject: [PATCH 3/4] update Makefile commands --- Makefile | 22 ++++++++++++++-------- README.md | 4 ++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 92da874..d76f9d6 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ ECR_URL_DEV:=222053980223.dkr.ecr.us-east-1.amazonaws.com/browsertrix-harvester- ### End of Terraform-generated header ### SHELL=/bin/bash DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ) +CURRENT_DATE=$(date +"%Y-%m-%d") ### Dependency commands ### install: # install python dependencies @@ -49,16 +50,14 @@ black-apply: ruff-apply: pipenv run ruff check --fix . -# CLI commands docker-shell: pipenv run harvester-dockerized docker-shell -# Docker commands dist-local: docker build -t $(ECR_NAME_DEV):latest . -# Testing commands -test-harvest-local: +# Test local harvest +run-harvest-local: pipenv run harvester-dockerized --verbose harvest \ --crawl-name="homepage" \ --config-yaml-file="/browsertrix-harvester/tests/fixtures/lib-website-homepage.yaml" \ @@ -66,15 +65,22 @@ test-harvest-local: --num-workers 4 \ --btrix-args-json='{"--maxPageLimit":"15"}' +# Test Dev1 harvest +run-harvest-dev: + CRAWL_NAME=test-harvest-ecs-$CURRENT_DATE aws ecs run-task \ + --cluster timdex-dev \ + --task-definition timdex-browsertrixharvester-dev \ + --launch-type="FARGATE" \ + --region us-east-1 \ + --network-configuration '{"awsvpcConfiguration": {"subnets": ["subnet-0488e4996ddc8365b","subnet-022e9ea19f5f93e65"], "securityGroups": ["sg-044033bf5f102c544"]}}' \ + --overrides '{"containerOverrides": [ {"name":"browsertrix-harvester", "command": ["--verbose", "harvest", "--crawl-name", "'"$CRAWL_NAME"'", "--config-yaml-file", "/browsertrix-harvester/tests/fixtures/lib-website-homepage.yaml", "--metadata-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.xml", "--wacz-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.wacz", "--num-workers", "2"]}]}' + +# Test local URL 
content parsing test-parse-url-content: pipenv run harvester parse-url-content \ --wacz-input-file="tests/fixtures/example.wacz" \ --url="https://example.com/hello-world" -# remote ecs task crawl in Dev1 -test-harvest-ecs-dev1: - bin/test-harvest-ecs-dev1.sh - ### Terraform-generated Developer Deploy Commands for Dev environment ### dist-dev: ## Build docker container (intended for developer-based manual build) docker build --platform linux/amd64 \ diff --git a/README.md b/README.md index d471be4..b043361 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ make lint #### Local Test Crawl ```shell -make test-harvest-local +make run-harvest-local ``` This Make command kicks off a harvest via a local docker container. The Make command reflects some ways in which a harvest can be configured, including local or S3 filepath to a configuration YAML, setting an output metadata file, and even passing in miscellaneous browsertrix arguments to the crawler not explicitly defined as CLI parameters in this app. 
@@ -52,7 +52,7 @@ The argument `--metadata-output-file="/crawls/collections/homepage/homepage.xml" ### Remote Test Crawl ```shell -make test-harvest-ecs-dev1 +make run-harvest-dev ``` * Set AWS credentials are required in calling context * Kicks off an ECS Fargate task in Dev1 From c782c8ceb25fb847c2e21253e49d0c9be424cca9 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 13 Oct 2023 14:24:15 -0400 Subject: [PATCH 4/4] fix makefile env var syntax, use timestamp --- Makefile | 6 +++--- README.md | 2 +- bin/test-harvest-ecs-dev1.sh | 13 ------------- 3 files changed, 4 insertions(+), 17 deletions(-) delete mode 100755 bin/test-harvest-ecs-dev1.sh diff --git a/Makefile b/Makefile index d76f9d6..f37f55e 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,6 @@ ECR_URL_DEV:=222053980223.dkr.ecr.us-east-1.amazonaws.com/browsertrix-harvester- ### End of Terraform-generated header ### SHELL=/bin/bash DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ) -CURRENT_DATE=$(date +"%Y-%m-%d") ### Dependency commands ### install: # install python dependencies @@ -67,13 +66,14 @@ run-harvest-local: # Test Dev1 harvest run-harvest-dev: - CRAWL_NAME=test-harvest-ecs-$CURRENT_DATE aws ecs run-task \ + CRAWL_NAME=test-harvest-ecs-$(DATETIME); \ + aws ecs run-task \ --cluster timdex-dev \ --task-definition timdex-browsertrixharvester-dev \ --launch-type="FARGATE" \ --region us-east-1 \ --network-configuration '{"awsvpcConfiguration": {"subnets": ["subnet-0488e4996ddc8365b","subnet-022e9ea19f5f93e65"], "securityGroups": ["sg-044033bf5f102c544"]}}' \ - --overrides '{"containerOverrides": [ {"name":"browsertrix-harvester", "command": ["--verbose", "harvest", "--crawl-name", "'"$CRAWL_NAME"'", "--config-yaml-file", "/browsertrix-harvester/tests/fixtures/lib-website-homepage.yaml", "--metadata-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.xml", "--wacz-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.wacz", "--num-workers", 
"2"]}]}' + --overrides '{"containerOverrides": [ {"name":"browsertrix-harvester", "command": ["--verbose", "harvest", "--crawl-name", "'$$CRAWL_NAME'", "--config-yaml-file", "/browsertrix-harvester/tests/fixtures/lib-website-homepage.yaml", "--metadata-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'$$CRAWL_NAME'.xml", "--wacz-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'$$CRAWL_NAME'.wacz", "--num-workers", "2"]}]}' # Test local URL content parsing test-parse-url-content: diff --git a/README.md b/README.md index b043361..ffc209c 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ make run-harvest-dev ``` * Set AWS credentials are required in calling context * Kicks off an ECS Fargate task in Dev1 - * WACZ file and metadata file are written to S3 at `timdex-extract-dev-222053980223/librarywebsite/test-harvest-ecs-$CURRENT_DATE.xml|wacz` + * WACZ file and metadata file are written to S3 at `timdex-extract-dev-222053980223/librarywebsite/test-harvest-ecs-YYYYMMDDTHHMMSSZ.xml|wacz` ## CLI commands diff --git a/bin/test-harvest-ecs-dev1.sh b/bin/test-harvest-ecs-dev1.sh deleted file mode 100755 index 3542415..0000000 --- a/bin/test-harvest-ecs-dev1.sh +++ /dev/null @@ -1,13 +0,0 @@ -# extract date -export CURRENT_DATE=$(date +"%Y-%m-%d") -export CRAWL_NAME=test-harvest-ecs-$CURRENT_DATE -echo "Invoking test crawl as ECS task in Dev1 named: $CRAWL_NAME" - -# invoke ECS task -aws ecs run-task \ ---cluster timdex-dev \ ---task-definition timdex-browsertrixharvester-dev \ ---launch-type="FARGATE" \ ---region us-east-1 \ ---network-configuration '{"awsvpcConfiguration": {"subnets": ["subnet-0488e4996ddc8365b","subnet-022e9ea19f5f93e65"], "securityGroups": ["sg-044033bf5f102c544"]}}' \ ---overrides '{"containerOverrides": [ {"name":"browsertrix-harvester", "command": ["--verbose", "harvest", "--crawl-name", "'"$CRAWL_NAME"'", "--config-yaml-file", "/browsertrix-harvester/tests/fixtures/lib-website-homepage.yaml", "--metadata-output-file", 
"s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.xml", "--wacz-output-file", "s3://timdex-extract-dev-222053980223/librarywebsite/'"$CRAWL_NAME"'.wacz", "--num-workers", "2"]}]}' \ No newline at end of file