From 5488ae2dc695b0ae1aed50838751be83266215e4 Mon Sep 17 00:00:00 2001 From: Shubham Hibare <20609766+hibare@users.noreply.github.com> Date: Thu, 27 Jun 2024 13:14:38 +0530 Subject: [PATCH] feat(ingestion): Implement DB migrations & Ingestion IAC (#4) This PR introduces several key enhancements to our project: 1. Database Migration using Alembic: Implements a new database migration to modify the schema and ensure compatibility with new data ingestion requirements. - Added new Alembic migration scripts located in the `migrations/db/versions` directory for the findings, scans, and jobs tables. - The migration scripts alter the database schema to include the new tables and columns required for the ingestion process. 2. Infrastructure as Code (IAC) for Data Ingestion: Sets up the infrastructure to handle data ingestion using AWS services. - Added Terraform configuration files in the `infrastructure/ingestion/aws` directory. - The configuration provisions the necessary AWS resources, including a Step Function, Lambda functions, and IAM roles. 3. Lambda Functions for Data Ingestion: Includes Lambda functions to ingest data from S3 into the database tables. - Implemented Lambda functions in the `infrastructure/ingestion/aws/lambda` directory. - They process incoming data files and insert the data into the appropriate database tables. 4. Ingestion: Implements findings data ingestion. --- .gitignore | 6 + .pre-commit-config.yaml | 9 + docker-compose.dev.yml | 48 +++++ infrastructure/ingestion/aws/README.md | 86 ++++++++ infrastructure/ingestion/aws/cloudwatch.tf | 17 ++ .../ingestion_sfn_definition.json | 95 +++++++++ infrastructure/ingestion/aws/iam.tf | 160 ++++++++++++++ infrastructure/ingestion/aws/lambda.tf | 100 +++++++++ .../ingestion/aws/lambda/ingestion/Dockerfile | 12 ++ .../ingestion/aws/lambda/ingestion/README.md | 37 ++++ .../aws/lambda/ingestion/ingestion.py | 68 ++++++ .../aws/lambda/ingestion/modules/__init__.py | 0 .../ingestion/modules/common/__init__.py | 0 .../aws/lambda/ingestion/modules/common/s3.py | 123 +++++++++++ .../lambda/ingestion/modules/common/utils.py | 1 + .../ingestion/modules/findings_ingestion.py | 197 ++++++++++++++++++ .../ingestion/aws/lambda/ingestion/package.sh | 6 + .../aws/lambda/ingestion/pyproject.toml | 21 ++ .../ingestion/aws/lambda/migration/Dockerfile | 12 ++ .../ingestion/aws/lambda/migration/README.md | 16 ++ .../ingestion/aws/lambda/migration/package.sh | 8 + infrastructure/ingestion/aws/locals.tf | 11 + infrastructure/ingestion/aws/outputs.tf | 3 + infrastructure/ingestion/aws/providers.tf | 23 ++ infrastructure/ingestion/aws/rds.tf | 18 ++ infrastructure/ingestion/aws/s3.tfbackend | 5 + infrastructure/ingestion/aws/secrets.tf | 19 ++ .../ingestion/aws/securitygroups.tf | 33 +++ infrastructure/ingestion/aws/sfn.tf | 13 ++ infrastructure/ingestion/aws/sts.tf | 1 + .../ingestion/aws/terraform.tfvars.example | 17 ++ infrastructure/ingestion/aws/variables.tf | 115 ++++++++++ infrastructure/ingestion/aws/vpc.tf | 22 ++ migrations/README.md | 51 +++++ migrations/alembic.ini | 116 +++++++++++ migrations/db/env.py | 79 +++++++ migrations/db/script.py.mako | 26 +++ .../9fab027609bc_create_scans_table.py | 42 ++++ .../ac05203c65bd_create_findings_table.py | 53 +++++ .../db4e0564e6df_create_jobs_table.py | 38 ++++ migrations/migrate.py | 24 +++ migrations/pyproject.toml | 20 ++ 42 files changed, 1751 insertions(+) create mode 100644 docker-compose.dev.yml create mode 100644 infrastructure/ingestion/aws/README.md create mode 100644 infrastructure/ingestion/aws/cloudwatch.tf create
mode 100644 infrastructure/ingestion/aws/configuration/ingestion_sfn_definition.json create mode 100644 infrastructure/ingestion/aws/iam.tf create mode 100644 infrastructure/ingestion/aws/lambda.tf create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/Dockerfile create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/README.md create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/ingestion.py create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/modules/__init__.py create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/modules/common/__init__.py create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/modules/common/s3.py create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/modules/common/utils.py create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/modules/findings_ingestion.py create mode 100755 infrastructure/ingestion/aws/lambda/ingestion/package.sh create mode 100644 infrastructure/ingestion/aws/lambda/ingestion/pyproject.toml create mode 100644 infrastructure/ingestion/aws/lambda/migration/Dockerfile create mode 100644 infrastructure/ingestion/aws/lambda/migration/README.md create mode 100755 infrastructure/ingestion/aws/lambda/migration/package.sh create mode 100644 infrastructure/ingestion/aws/locals.tf create mode 100644 infrastructure/ingestion/aws/outputs.tf create mode 100644 infrastructure/ingestion/aws/providers.tf create mode 100644 infrastructure/ingestion/aws/rds.tf create mode 100644 infrastructure/ingestion/aws/s3.tfbackend create mode 100644 infrastructure/ingestion/aws/secrets.tf create mode 100644 infrastructure/ingestion/aws/securitygroups.tf create mode 100644 infrastructure/ingestion/aws/sfn.tf create mode 100644 infrastructure/ingestion/aws/sts.tf create mode 100644 infrastructure/ingestion/aws/terraform.tfvars.example create mode 100644 infrastructure/ingestion/aws/variables.tf create mode 100644 infrastructure/ingestion/aws/vpc.tf create mode 100644 migrations/README.md create mode 100644 migrations/alembic.ini create mode 100644 migrations/db/env.py create mode 100644 migrations/db/script.py.mako create mode 100644 migrations/db/versions/9fab027609bc_create_scans_table.py create mode 100644 migrations/db/versions/ac05203c65bd_create_findings_table.py create mode 100644 migrations/db/versions/db4e0564e6df_create_jobs_table.py create mode 100644 migrations/migrate.py create mode 100644 migrations/pyproject.toml diff --git a/.gitignore b/.gitignore index 4c68d1b..e56864c 100644 --- a/.gitignore +++ b/.gitignore @@ -254,3 +254,9 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
.idea/ + +# Databases +*.db + +# Docker +volumes/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6aa6e7c..36e7674 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,3 +29,12 @@ repos: entry: bash -c 'docker run --rm -v "$(pwd):/workdir" -i --rm trufflesecurity/trufflehog:latest git file:///workdir --since-commit HEAD --only-verified --fail' language: system stages: ["commit", "push"] + - repo: https://github.com/hadolint/hadolint + rev: v2.10.0 + hooks: + - id: hadolint-docker + name: Lint Dockerfiles + description: Runs hadolint Docker image to lint Dockerfiles + language: docker_image + types: ["dockerfile"] + entry: ghcr.io/hadolint/hadolint hadolint diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000..e886712 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,48 @@ +services: + init: + image: python:3.9 + container_name: init + volumes: + - ./migrations:/migrations:ro + environment: + - DB_URL=postgresql://myuser:mypassword@postgres/mydatabase + command: + - sh + - -c + - | + cd /migrations + pip install poetry + poetry lock --no-update + poetry install + poetry run python migrate.py + depends_on: + postgres: + condition: service_healthy + + postgres: + image: postgres:latest + container_name: postgres + environment: + POSTGRES_USER: myuser + POSTGRES_PASSWORD: mypassword + POSTGRES_DB: mydatabase + ports: + - "127.0.0.1:5432:5432" + volumes: + - ./volumes/postgres:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U myuser -d mydatabase"] + interval: 10s + timeout: 5s + retries: 3 + + adminer: + image: adminer:latest + container_name: adminer + ports: + - "127.0.0.1:8080:8080" + depends_on: + init: + condition: service_completed_successfully + postgres: + condition: service_started diff --git a/infrastructure/ingestion/aws/README.md b/infrastructure/ingestion/aws/README.md new file mode 100644 index 0000000..14f611a --- /dev/null +++ b/infrastructure/ingestion/aws/README.md @@ -0,0 +1,86 @@ +# infrastructure + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >=1.3 | +| [aws](#requirement\_aws) | ~> 5.0 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | ~> 5.0 | +| [local](#provider\_local) | n/a | +| [null](#provider\_null) | n/a | +| [random](#provider\_random) | n/a | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [aws_cloudwatch_event_rule.ingestion_sfn_trigger_rule](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.ingestion_sfn_trigger](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_db_instance.rds_postgres](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_instance) | resource | +| [aws_iam_policy.policy_for_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.cloudwatch_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role.lambda_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role.sfn_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy.cloudwatch_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.sfn_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy_attachment.LambdaExecutionRolePolicyAttachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_lambda_function.ingestion-lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_function.migration-lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_secretsmanager_secret.rds_master_password](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | +| [aws_secretsmanager_secret_version.rds_master_password](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | +| [aws_security_group.lambda_sg](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | +| [aws_security_group.rds_sg](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | +| [aws_sfn_state_machine.ingestion-step-function](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sfn_state_machine) | resource | +| [null_resource.ingestion_lambda_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [null_resource.migration_lambda_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [random_password.rds_master_password](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_iam_policy_document.cloudwatch_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.cloudwatch_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| 
[aws_iam_policy_document.lambda_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.permissions_for_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.sf_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.sfn_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_security_group.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_group) | data source | +| [aws_subnet.selected](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/subnet) | data source | +| [aws_subnets.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/subnets) | data source | +| [aws_vpc.selected](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/vpc) | data source | +| [local_file.ingestion_lambda_build](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | +| [local_file.migration_lambda_build](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [aws\_profile](#input\_aws\_profile) | AWS profile to use for authentication | `string` | n/a | yes | +| [aws\_region](#input\_aws\_region) | AWS region where to deploy resources | `string` | n/a | yes | +| [db\_subnet\_group\_name](#input\_db\_subnet\_group\_name) | Name of the RDS subnet group | `string` | n/a | yes | +| [disable\_ingestion\_schedule](#input\_disable\_ingestion\_schedule) | Disable the ingestion schedule | `bool` | `false` | no | +| [environment\_type](#input\_environment\_type) | Environment type | `string` | n/a | yes | +| [ingestion\_schedule](#input\_ingestion\_schedule) | Cron schedule for the CloudWatch Event Rule | `string` | `"rate(24 hours)"` | no | +| [permissions\_boundary\_arn](#input\_permissions\_boundary\_arn) | ARN of the permissions boundary to use for the IAM role | `string` | n/a | yes | +| [project\_name](#input\_project\_name) | Name of the project | `string` | `"secrets-finder"` | no | +| [rds\_db\_name](#input\_rds\_db\_name) | Name of the database to create in the RDS instance | `string` | `"secrets_finder"` | no | +| [rds\_username](#input\_rds\_username) | Username for the RDS instance | `string` | `"secrets_finder"` | no | +| [s3\_bucket\_name](#input\_s3\_bucket\_name) | Name of the S3 bucket to create | `string` | n/a | yes | +| [subnet\_name](#input\_subnet\_name) | Name of the subnet where to deploy the resources (wildcards are allowed: first match is used) | `string` | n/a | yes | +| [tags](#input\_tags) | A map of tags to add to the resources | `map(string)` | n/a | yes | +| [vpc\_name](#input\_vpc\_name) | Identifier of the VPC to use for secrets-finder | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [rds\_pg\_endpoint](#output\_rds\_pg\_endpoint) | n/a | + diff --git a/infrastructure/ingestion/aws/cloudwatch.tf b/infrastructure/ingestion/aws/cloudwatch.tf new file mode 100644 index 0000000..90571d3 --- /dev/null +++ 
b/infrastructure/ingestion/aws/cloudwatch.tf @@ -0,0 +1,17 @@ +resource "aws_cloudwatch_event_rule" "ingestion_sfn_trigger_rule" { + name = "${var.project_name}-ingestion-sfn-trigger" + description = "Triggers the Step function on schedule" + schedule_expression = var.ingestion_schedule + state = var.disable_ingestion_schedule ? "DISABLED" : "ENABLED" +} + +resource "aws_cloudwatch_event_target" "ingestion_sfn_trigger" { + rule = aws_cloudwatch_event_rule.ingestion_sfn_trigger_rule.name + arn = aws_sfn_state_machine.ingestion-step-function.arn + role_arn = aws_iam_role.cloudwatch_role.arn + + depends_on = [ + aws_iam_role.cloudwatch_role, + aws_iam_role_policy.cloudwatch_policy, + ] +} diff --git a/infrastructure/ingestion/aws/configuration/ingestion_sfn_definition.json b/infrastructure/ingestion/aws/configuration/ingestion_sfn_definition.json new file mode 100644 index 0000000..db2f5fd --- /dev/null +++ b/infrastructure/ingestion/aws/configuration/ingestion_sfn_definition.json @@ -0,0 +1,95 @@ +{ + "Comment": "Ingestion State Machine", + "StartAt": "BootStrapState", + "States": { + "BootStrapState": { + "Type": "Task", + "Resource": "${migrate_lambda_arn}", + "Next": "IngestionState" + }, + "IngestionState": { + "Type": "Parallel", + "Branches": [ + { + "Comment": "Ingest Scheduled Scan Findings", + "StartAt": "ListScheduledScanFindingsFiles", + "States": { + "ListScheduledScanFindingsFiles": { + "Type": "Task", + "Resource": "${ingestion_lambda_arn}", + "ResultPath": "$.lambdaResult", + "Parameters": { + "action": "list_files", + "prefix": "secrets-finder/scheduled-scans/results/" + }, + "Next": "IngestScheduledScanFindingsFiles" + }, + "IngestScheduledScanFindingsFiles": { + "Type": "Map", + "ItemsPath": "$.lambdaResult.body.files", + "Parameters": { + "index.$": "$$.Map.Item.Index", + "key.$": "$$.Map.Item.Value" + }, + "Iterator": { + "StartAt": "IngestScheduledScanFindings", + "States": { + "IngestScheduledScanFindings": { + "Type": "Task", + "Resource": "${ingestion_lambda_arn}", + "Parameters": { + "action": "ingest_findings", + "file_key.$": "$.key" + }, + "End": true + } + } + }, + "End": true + } + } + }, + { + "Comment": "Ingest Ongoing Scan Findings", + "StartAt": "ListOngoingScanFindingsFiles", + "States": { + "ListOngoingScanFindingsFiles": { + "Type": "Task", + "Resource": "${ingestion_lambda_arn}", + "ResultPath": "$.lambdaResult", + "Parameters": { + "action": "list_files", + "prefix": "secrets-finder/ongoing-scans/results/" + }, + "Next": "IngestOngoingScanFindingsFiles" + }, + "IngestOngoingScanFindingsFiles": { + "Type": "Map", + "ItemsPath": "$.lambdaResult.body.files", + "Parameters": { + "index.$": "$$.Map.Item.Index", + "key.$": "$$.Map.Item.Value" + }, + "Iterator": { + "StartAt": "IngestOngoingScanFindings", + "States": { + "IngestOngoingScanFindings": { + "Type": "Task", + "Resource": "${ingestion_lambda_arn}", + "Parameters": { + "action": "ingest_findings", + "file_key.$": "$.key" + }, + "End": true + } + } + }, + "End": true + } + } + } + ], + "End": true + } + } +} diff --git a/infrastructure/ingestion/aws/iam.tf b/infrastructure/ingestion/aws/iam.tf new file mode 100644 index 0000000..15dbf9c --- /dev/null +++ b/infrastructure/ingestion/aws/iam.tf @@ -0,0 +1,160 @@ +# Lambda execution role +data "aws_iam_policy_document" "lambda_assume_role" { + statement { + effect = "Allow" + principals { + identifiers = ["lambda.amazonaws.com"] + type = "Service" + } + actions = ["sts:AssumeRole"] + } +} + +resource "aws_iam_role" "lambda_execution_role" { + name = 
"${var.project_name}-ingestion-lambda-execution-role" + assume_role_policy = data.aws_iam_policy_document.lambda_assume_role.json + path = "/" + permissions_boundary = var.permissions_boundary_arn +} + +data "aws_iam_policy_document" "permissions_for_execution_role" { + statement { + sid = "WriteToCloudWatchLogGroup" + effect = "Allow" + actions = [ + "logs:CreateLogStream", + "logs:PutLogEvents", + ] + resources = ["arn:aws:logs:*:*:*"] + } + + statement { + sid = "AllowAccessToBucket" + effect = "Allow" + actions = [ + "s3:ListBucket", + "s3:GetObject", + "s3:DeleteObject" + ] + resources = [ + "${local.s3_bucket_arn}", + "${local.s3_bucket_arn}/*" + ] + } + + statement { + sid = "AllowAccessToRDS" + effect = "Allow" + actions = [ + "rds-data:ExecuteStatement", + "rds-data:BatchExecuteStatement", + "rds-data:BeginTransaction", + "rds-data:CommitTransaction", + "rds-data:RollbackTransaction" + ] + resources = [ + aws_db_instance.rds_postgres.arn + ] + } + + statement { + sid = "AllowEC2Perms" + effect = "Allow" + actions = [ + "ec2:DescribeNetworkInterfaces", + "ec2:CreateNetworkInterface", + "ec2:DeleteNetworkInterface", + "ec2:DescribeInstances", + "ec2:AttachNetworkInterface" + ] + resources = ["*"] + } +} + +resource "aws_iam_policy" "policy_for_execution_role" { + name = "${var.project_name}-ingestion-lambda-execution-role-permissions" + description = "Policy granting necessary permissions to Lambda execution instance" + policy = data.aws_iam_policy_document.permissions_for_execution_role.json +} + +resource "aws_iam_role_policy_attachment" "LambdaExecutionRolePolicyAttachment" { + policy_arn = aws_iam_policy.policy_for_execution_role.arn + role = aws_iam_role.lambda_execution_role.name +} + +# Step function role + +data "aws_iam_policy_document" "sf_assume_role" { + statement { + effect = "Allow" + principals { + identifiers = ["states.amazonaws.com"] + type = "Service" + } + actions = ["sts:AssumeRole"] + } +} + +resource "aws_iam_role" "sfn_role" { + name = "${var.project_name}-ingestion-sf-execution-role" + path = "/" + permissions_boundary = var.permissions_boundary_arn + assume_role_policy = data.aws_iam_policy_document.sf_assume_role.json +} + +data "aws_iam_policy_document" "sfn_policy_document" { + statement { + effect = "Allow" + actions = [ + "lambda:InvokeFunction" + ] + resources = [ + aws_lambda_function.ingestion-lambda.arn, + aws_lambda_function.migration-lambda.arn + ] + } +} + +resource "aws_iam_role_policy" "sfn_policy" { + name = "${var.project_name}-ingestion-sf-execution-policy" + role = aws_iam_role.sfn_role.id + policy = data.aws_iam_policy_document.sfn_policy_document.json +} + +# Cloudwatch role + +data "aws_iam_policy_document" "cloudwatch_assume_role" { + statement { + effect = "Allow" + principals { + identifiers = ["events.amazonaws.com"] + type = "Service" + } + actions = ["sts:AssumeRole"] + } +} + +resource "aws_iam_role" "cloudwatch_role" { + name = "${var.project_name}-ingestion-cloud-watch-role" + path = "/" + permissions_boundary = var.permissions_boundary_arn + assume_role_policy = data.aws_iam_policy_document.cloudwatch_assume_role.json +} + +data "aws_iam_policy_document" "cloudwatch_policy_document" { + statement { + effect = "Allow" + actions = [ + "states:StartExecution" + ] + resources = [ + aws_sfn_state_machine.ingestion-step-function.arn + ] + } +} + +resource "aws_iam_role_policy" "cloudwatch_policy" { + name = "${var.project_name}-cloudwatch-event-policy" + role = aws_iam_role.cloudwatch_role.id + policy = 
data.aws_iam_policy_document.cloudwatch_policy_document.json +} diff --git a/infrastructure/ingestion/aws/lambda.tf b/infrastructure/ingestion/aws/lambda.tf new file mode 100644 index 0000000..c7d7b03 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda.tf @@ -0,0 +1,100 @@ +resource "null_resource" "ingestion_lambda_build" { + provisioner "local-exec" { + command = "./package.sh" + working_dir = "${local.ingestion_lambda_dir}/" + } + + triggers = { + always_run = timestamp() + } +} + +data "local_file" "ingestion_lambda_build" { + filename = local.ingestion_lambda_archive + depends_on = [null_resource.ingestion_lambda_build] +} + +resource "aws_lambda_function" "ingestion-lambda" { + function_name = "${var.project_name}-ingestion-lambda" + role = aws_iam_role.lambda_execution_role.arn + architectures = ["arm64"] + runtime = "python3.9" + handler = "ingestion.handler" + timeout = 900 # 15 minutes + memory_size = 512 # 512 MB + filename = local.ingestion_lambda_archive + source_code_hash = data.local_file.ingestion_lambda_build.content_sha256 + + vpc_config { + subnet_ids = [data.aws_subnet.selected.id] + security_group_ids = [aws_security_group.lambda_sg.id] + } + + ephemeral_storage { + size = 1024 # 1 GB + } + + environment { + variables = { + BUCKET_NAME = var.s3_bucket_name + DB_URL = local.db_url + } + } + + depends_on = [ + data.local_file.ingestion_lambda_build, + aws_iam_role.lambda_execution_role, + aws_iam_policy.policy_for_execution_role, + aws_iam_role_policy_attachment.LambdaExecutionRolePolicyAttachment + ] +} + +resource "null_resource" "migration_lambda_build" { + provisioner "local-exec" { + command = "./package.sh" + working_dir = "${local.migration_lambda_dir}/" + } + + triggers = { + always_run = timestamp() + } +} + +data "local_file" "migration_lambda_build" { + filename = local.migration_lambda_archive + depends_on = [null_resource.migration_lambda_build] +} + +resource "aws_lambda_function" "migration-lambda" { + function_name = "${var.project_name}-migration-lambda" + role = aws_iam_role.lambda_execution_role.arn + architectures = ["arm64"] + runtime = "python3.9" + handler = "migrate.migrate" + timeout = 60 # 1 minute + memory_size = 512 # 512 MB + filename = local.migration_lambda_archive + source_code_hash = data.local_file.migration_lambda_build.content_sha256 + + vpc_config { + subnet_ids = [data.aws_subnet.selected.id] + security_group_ids = [aws_security_group.lambda_sg.id] + } + + ephemeral_storage { + size = 512 # 512 MB + } + + environment { + variables = { + DB_URL = local.db_url + } + } + + depends_on = [ + data.local_file.migration_lambda_build, + aws_iam_role.lambda_execution_role, + aws_iam_policy.policy_for_execution_role, + aws_iam_role_policy_attachment.LambdaExecutionRolePolicyAttachment + ] +} diff --git a/infrastructure/ingestion/aws/lambda/ingestion/Dockerfile b/infrastructure/ingestion/aws/lambda/ingestion/Dockerfile new file mode 100644 index 0000000..b81d74f --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/Dockerfile @@ -0,0 +1,12 @@ +FROM python@sha256:320a7a4250aba4249f458872adecf92eea88dc6abd2d76dc5c0f01cac9b53990 + +RUN pip install poetry==1.8.3 --no-cache-dir + +WORKDIR /app + +COPY . 
/app/ + +RUN poetry self add poetry-plugin-lambda-build \ + && poetry self add poetry-plugin-export \ + && poetry lock --no-update \ + && poetry build-lambda diff --git a/infrastructure/ingestion/aws/lambda/ingestion/README.md b/infrastructure/ingestion/aws/lambda/ingestion/README.md new file mode 100644 index 0000000..2a8579c --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/README.md @@ -0,0 +1,37 @@ +# Ingestion + +This directory contains the data ingestion Lambda. The Lambda is invoked by a Step Function. + +The packaging process uses the Poetry Lambda plugin and Docker to generate Lambda packages for the correct platform. This is automated when applying Terraform. + +The Lambda takes an action as input. Each action performs a specific function. + +## Lambda Actions + +- `list_files`: This action lists files in an S3 bucket under a given prefix. + Example: + ```json + { + "action": "list_files", + "prefix": "secrets-finder/scheduled-scans/results/" + } + ``` +- `ingest_findings`: This action reads a given `.json` file and creates new records in the `findings`, `scans`, and `jobs` tables. The corresponding file is deleted from S3 on successful ingestion. + Example: + ```json + { + "action": "ingest_findings", + "file_key": "secrets-finder/scheduled-scans/results/7eb4d1ab-ac6a-4b84-a18d-4bd944d4ef2a.json" + } + ``` + +## Add New Ingestion + +Creating a new ingestion is a four-step process. + +1. Create the necessary DB migration revision under the `migrations` directory. Refer to [Creating New Revisions](../../../../../migrations/README.md#creating-new-revisions). +2. Create a new ingestion script under the `modules` directory. +3. Register the new ingestion with an action in `ingestion.py` under `ingestion_callback_mapping`. +4. Add a new branch to the [Step Function definition](../../configuration/ingestion_sfn_definition.json). + +Use `terraform apply` to build and deploy the Lambda. Once deployed, the next Step Function invocation will automatically trigger the new ingestion. diff --git a/infrastructure/ingestion/aws/lambda/ingestion/ingestion.py b/infrastructure/ingestion/aws/lambda/ingestion/ingestion.py new file mode 100644 index 0000000..442b9c7 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/ingestion.py @@ -0,0 +1,68 @@ +import os +from typing import List, Dict, Any, Callable, Union +import logging +from modules.common.s3 import S3 +from modules.findings_ingestion import ingest_findings + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], +) + +bucket_name: str = os.environ.get("BUCKET_NAME") +db_url: str = os.environ.get("DB_URL") + +ingestion_callback_mapping: Dict[str, Callable[[str, str, str], bool]] = { + "ingest_findings": ingest_findings +} + + +def list_files(prefix: str) -> Dict[str, Union[int, Dict[str, List[str]]]]: + s3 = S3(bucket_name) + files = s3.list_files(prefix) + return {"statusCode": 200, "body": {"files": files}} + + +def handler(event: Dict[str, Any], _) -> Dict[str, Any]: + """ + Handle the Lambda function invocation. + + Args: + event (Dict[str, Any]): The event data passed to the Lambda function. + _ (Any): The context object representing the runtime information. + + Returns: + Dict[str, Any]: The response data returned by the Lambda function. + + Raises: + ValueError: If the request is invalid or the action is not supported.
+ """ + action: str = event.get("action") + + if action == "list_files": + prefix: str = event.get("prefix") + if not prefix: + logging.error("missing prefix in request for action list_files") + raise ValueError("Invalid request") + + response: Dict[str, Union[int, Dict[str, List[str]]]] = list_files(prefix) + return response + + elif action in ingestion_callback_mapping: + file_key: str = event.get("file_key") + if not file_key: + logging.error("missing file_key in request for action ingest_findings") + raise ValueError("Invalid request") + + status: bool = ingestion_callback_mapping[action](db_url, bucket_name, file_key) + + if not status: + logging.error("Error ingesting data") + raise ValueError("Error ingesting data") + + return {"statusCode": 200, "body": {"success": status}} + + else: + logging.error(f"Invalid action: {action}") + raise ValueError("Invalid request") diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/__init__.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/common/__init__.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/common/s3.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/s3.py new file mode 100644 index 0000000..a512dd6 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/s3.py @@ -0,0 +1,123 @@ +import os +import tempfile +import boto3 +from typing import List, Tuple + + +class S3: + """ + Represents an S3 client for interacting with an S3 bucket. + + Args: + bucket_name (str): The name of the S3 bucket. + + Attributes: + client (boto3.client): The S3 client. + bucket_name (str): The name of the S3 bucket. + + """ + + client: boto3.client = None + bucket_name: str = None + + def __init__(self, bucket_name: str) -> None: + """ + Initializes the S3 client. + + Args: + bucket_name (str): The name of the S3 bucket. + + """ + self.client = boto3.client("s3") + self.bucket_name = bucket_name + + def list_files(self, prefix: str) -> List[str]: + """ + Lists all the files in the S3 bucket with the specified prefix. + + Args: + prefix (str): The prefix to filter the files. + + Returns: + List[str]: A list of file keys. + + """ + keys: List[str] = [] + continuation_token: str = None + + if not prefix.endswith("/"): + prefix += "/" + + while True: + kwargs: dict = { + "Bucket": self.bucket_name, + "Prefix": prefix, + "Delimiter": "/", + } + + if continuation_token: + kwargs["ContinuationToken"] = continuation_token + + response: dict = self.client.list_objects_v2(**kwargs) + contents: List[dict] = response.get("Contents", []) + _keys: List[str] = [ + content["Key"] + for content in contents + if not content["Key"].endswith("/") + ] + keys.extend(_keys) + + if not response.get("IsTruncated"): + break + + continuation_token = response.get("NextContinuationToken") + + return keys + + def download_file(self, file_key: str) -> str: + """ + Downloads the file with the specified key from the bucket. + + Args: + file_key (str): The key of the file to download. + + Returns: + str: The local path of the downloaded file. 
+ + """ + file_name: str = os.path.basename(file_key) + local_path: str = os.path.join(tempfile.gettempdir(), file_name) + self.client.download_file(self.bucket_name, file_key, local_path) + return local_path + + def download_first_file(self, prefix: str) -> Tuple[str, str]: + """ + Downloads the first file with the specified prefix from the bucket. + + Args: + prefix (str): The prefix to filter the files. + + Returns: + Tuple[str, str]: A tuple containing the file key and the local path of the downloaded file. + + """ + files = self.list_files(prefix) + if not files: + return None + + key = files[0] + return key, self.download_file(files[0]) + + def delete_file(self, file_key: str) -> bool: + """ + Deletes the file with the specified key from the bucket. + + Args: + file_key (str): The key of the file to delete. + + Returns: + bool: True if the file was successfully deleted, False otherwise. + + """ + self.client.delete_object(Bucket=self.bucket_name, Key=file_key) + return True diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/common/utils.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/utils.py new file mode 100644 index 0000000..c61ddd0 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/utils.py @@ -0,0 +1 @@ +DATE_TIME_FORMAT: str = "%Y-%m-%dT%H:%M:%S.%f" diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/findings_ingestion.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/findings_ingestion.py new file mode 100644 index 0000000..9246eb9 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/modules/findings_ingestion.py @@ -0,0 +1,197 @@ +import datetime +import json +import logging +from sqlalchemy import ( + JSON, + VARCHAR, + Boolean, + Column, + Integer, + String, + DateTime, + create_engine, +) +from sqlalchemy.orm import sessionmaker, declarative_base +from modules.common.s3 import S3 +from modules.common.utils import DATE_TIME_FORMAT +import uuid + +Base = declarative_base() + + +class Finding(Base): + __tablename__ = "findings" + uuid = Column(String, primary_key=True) + scan_uuid = Column(String, nullable=False) + job_uuid = Column(String, nullable=False) + organization = Column(String, nullable=True) + scan_context = Column(String, nullable=False) + created_on = Column(DateTime, nullable=False) + decoder_name = Column(String, nullable=False) + detector_name = Column(String, nullable=False) + detector_type = Column(Integer, nullable=False) + raw = Column(VARCHAR, nullable=False) + raw_v2 = Column(VARCHAR, nullable=True) + redacted = Column(String, nullable=True) + source_name = Column(String, nullable=False) + source_type = Column(Integer, nullable=False) + verified = Column(Boolean, nullable=False) + extra_data = Column(JSON, nullable=True) + repository = Column(String, nullable=True) + filename = Column(String, nullable=False) + commit_hash = Column(String, nullable=True) + committer_email = Column(String, nullable=True) + commit_timestamp = Column(DateTime, nullable=True) + line_number = Column(Integer, nullable=False) + is_still_valid = Column(Boolean, nullable=False) + last_validated_on = Column(DateTime, nullable=False) + + +class Scans(Base): + __tablename__ = "scans" + uuid = Column(String, primary_key=True) + job_uuid = Column(String, nullable=False) + scan_identifier = Column(String, nullable=True) + scm = Column(String, nullable=False) + organization = Column(String, nullable=True) + repository = Column(String, nullable=False) + scan_context = Column(String, 
nullable=False) + started_on = Column(DateTime, nullable=False) + completed_on = Column(DateTime, nullable=False) + status = Column(Integer, nullable=False) + scan_mode = Column(String, nullable=False) + scan_type = Column(String, nullable=False) + # metadata is a reserved attribute name in SQLAlchemy + metadata_ = Column("metadata", JSON, nullable=True) + + +class Jobs(Base): + __tablename__ = "jobs" + uuid = Column(String, primary_key=True) + scan_identifier = Column(String, nullable=False) + scm = Column(String, nullable=False) + scan_context = Column(String, nullable=False) + started_on = Column(DateTime, nullable=False) + completed_on = Column(DateTime, nullable=False) + status = Column(Integer, nullable=False) + scan_mode = Column(String, nullable=False) + scan_type = Column(String, nullable=False) + + +def ingest_findings(db_url: str, bucket_name: str, file_key: str) -> bool: + """ + Ingests findings from a file downloaded from S3 into a database. + + Args: + db_url (str): The URL of the database to connect to. + bucket_name (str): The name of the S3 bucket. + file_key (str): The key of the file in the S3 bucket. + + Returns: + bool: True if the ingestion is successful, False otherwise. + """ + logging.info(f"Downloading file from S3, key: {file_key}, bucket: {bucket_name}") + s3 = S3(bucket_name) + file_path = s3.download_file(file_key) + logging.info(f"File downloaded to {file_path}, key: {file_key}") + + with open(file_path, "r") as file: + data = json.load(file) + + if not data: + logging.error("No data in the file") + return False + + # Create a SQLAlchemy engine to connect to the database + engine = create_engine(db_url) + + # Create a session + Session = sessionmaker(bind=engine) + session = Session() + + job = Jobs( + uuid=data["scan_uuid"], + scan_identifier=data["scan_identifier"], + scm=data["scm"], + scan_context=data["scan_context"], + started_on=datetime.datetime.strptime(data["start"], DATE_TIME_FORMAT), + completed_on=datetime.datetime.strptime(data["end"], DATE_TIME_FORMAT), + status=data["status"], + scan_type=data["scan_type"], + scan_mode=data["scan_mode"], + ) + + session.add(job) + + for result in data.get("results", []): + scan = Scans( + uuid=result["scan_uuid"], + job_uuid=job.uuid, + scan_identifier=job.scan_identifier, + scm=job.scm, + organization=result["organization"], + repository=result["repository"], + scan_context=job.scan_context, + started_on=datetime.datetime.strptime(result["start"], DATE_TIME_FORMAT), + completed_on=datetime.datetime.strptime(result["end"], DATE_TIME_FORMAT), + status=result.get("status"), + scan_mode=job.scan_mode, + scan_type=job.scan_type, + metadata_=result.get("metadata", {}), + ) + + logging.info(f'Ingesting scan: {result["scan_uuid"]}') + session.add(scan) + + for finding in result.get("findings", []): + source_meta_data = list( + finding.get("SourceMetadata", {}).get("Data", {}).values() + )[0] + finding = Finding( + uuid=str(uuid.uuid4()), + scan_uuid=result["scan_uuid"], + job_uuid=job.uuid, + organization=result["organization"], + scan_context=job.scan_context, + created_on=datetime.datetime.now(), + decoder_name=finding["DetectorName"], + detector_name=finding["DetectorName"], + detector_type=finding["DetectorType"], + raw=finding["Raw"], + raw_v2=finding.get("RawV2", ""), + redacted=finding.get("Redacted", ""), + source_name=finding["SourceName"], + source_type=finding["SourceType"], + verified=finding["Verified"], + extra_data=finding.get("ExtraData", {}), + repository=result["repository"], + 
filename=source_meta_data["file"], + commit_hash=source_meta_data.get("commit"), + committer_email=source_meta_data.get("email"), + commit_timestamp=( + datetime.datetime.strptime( + source_meta_data.get("timestamp"), "%Y-%m-%d %H:%M:%S %z" + ) + if source_meta_data.get("timestamp") + else None + ), + line_number=source_meta_data["line"], + is_still_valid=finding["Verified"], + last_validated_on=datetime.datetime.strptime( + result["end"], DATE_TIME_FORMAT + ), + ) + + logging.info( + f'Ingesting finding: {finding.uuid} for scan: {result["scan_uuid"]}' + ) + session.add(finding) + + if not s3.delete_file(file_key): + logging.error(f"Error deleting file from S3, key: {file_key}") + session.rollback() + return False + + logging.info(f"Deleted file from S3, key: {file_key}") + session.commit() + return True diff --git a/infrastructure/ingestion/aws/lambda/ingestion/package.sh b/infrastructure/ingestion/aws/lambda/ingestion/package.sh new file mode 100755 index 0000000..2866648 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/package.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +docker build -t ingestion-lambda --platform=linux/arm64 -f Dockerfile . + +docker run --rm -v $(pwd):/output ingestion-lambda cp /app/ingestion.zip /output/ diff --git a/infrastructure/ingestion/aws/lambda/ingestion/pyproject.toml b/infrastructure/ingestion/aws/lambda/ingestion/pyproject.toml new file mode 100644 index 0000000..e1c9049 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/ingestion/pyproject.toml @@ -0,0 +1,21 @@ +[tool.poetry] +name = "ingestion" +version = "0.1.0" +description = "Lambda to ingest data into the data lake" +authors = ["Thomson Reuters "] +license = "mit" +readme = "README.md" +include = ["modules/*.py", "modules/common/*"] + +[tool.poetry.dependencies] +python = "^3.9" +boto3 = "^1.34.130" +sqlalchemy = "^2.0.31" +psycopg2-binary = "^2.9.9" + +[tool.poetry-plugin-lambda-build] +package_artifact_path = "ingestion.zip" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/infrastructure/ingestion/aws/lambda/migration/Dockerfile b/infrastructure/ingestion/aws/lambda/migration/Dockerfile new file mode 100644 index 0000000..b81d74f --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/migration/Dockerfile @@ -0,0 +1,12 @@ +FROM python@sha256:320a7a4250aba4249f458872adecf92eea88dc6abd2d76dc5c0f01cac9b53990 + +RUN pip install poetry==1.8.3 --no-cache-dir + +WORKDIR /app + +COPY . /app/ + +RUN poetry self add poetry-plugin-lambda-build \ + && poetry self add poetry-plugin-export \ + && poetry lock --no-update \ + && poetry build-lambda diff --git a/infrastructure/ingestion/aws/lambda/migration/README.md b/infrastructure/ingestion/aws/lambda/migration/README.md new file mode 100644 index 0000000..3e2206c --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/migration/README.md @@ -0,0 +1,16 @@ +# Migration Lambda + +This directory contains all the scripts necessary to package the migrations (located at the repository root) as a Lambda function. + +The packaging process uses the Poetry Lambda plugin and leverages Docker to ensure Lambda packages are generated for the correct platform. + +## Usage + +To package the Lambda, run the following command: + +```bash +./package.sh +``` + +> [!NOTE] +> Any changes to the migrations should be detected automatically during the repackaging process.
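The archive produced here is wired up in `lambda.tf` with `migrate.migrate` as its handler, i.e. the `migrate.py` wrapper from the `migrations` directory that the migrations README describes as a way to run migrations programmatically. The exact contents of `migrate.py` are not shown in this section; a minimal sketch of such a handler, assuming the standard `alembic.config.Config` / `alembic.command` API and the `DB_URL` environment variable consumed by `db/env.py`, might look like this:

```python
import logging
import os

from alembic import command
from alembic.config import Config


def migrate(event=None, context=None):
    """Lambda entry point: apply all pending Alembic migrations."""
    # db/env.py reads DB_URL and injects it into the Alembic config,
    # so fail fast with a clear error if it is missing.
    if "DB_URL" not in os.environ:
        raise RuntimeError("DB_URL environment variable is not set")

    # alembic.ini and the db/ scripts are assumed to be bundled next to
    # this module inside the Lambda package.
    config_path = os.path.join(os.path.dirname(__file__), "alembic.ini")
    config = Config(config_path)

    logging.info("Applying Alembic migrations up to head")
    command.upgrade(config, "head")

    return {"statusCode": 200, "body": {"success": True}}
```

In the Step Function definition, this migration Lambda is invoked by the initial `BootStrapState` task, so the schema is migrated before either ingestion branch starts inserting data.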
diff --git a/infrastructure/ingestion/aws/lambda/migration/package.sh b/infrastructure/ingestion/aws/lambda/migration/package.sh new file mode 100755 index 0000000..3ff9917 --- /dev/null +++ b/infrastructure/ingestion/aws/lambda/migration/package.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +ROOT_DIR=$(git rev-parse --show-toplevel) + +docker build -t migrate-lambda --platform=linux/arm64 -f Dockerfile $ROOT_DIR/migrations + +docker run --rm -v $(pwd):/output migrate-lambda cp /app/migration.zip /output/ diff --git a/infrastructure/ingestion/aws/locals.tf b/infrastructure/ingestion/aws/locals.tf new file mode 100644 index 0000000..ebf70ba --- /dev/null +++ b/infrastructure/ingestion/aws/locals.tf @@ -0,0 +1,11 @@ +locals { + environment = replace(lower(var.environment_type), " ", "-") + db_url = "postgresql://${var.rds_username}:${random_password.rds_master_password.result}@${aws_db_instance.rds_postgres.address}" + configuration_dir = "${path.module}/configuration" + ingestion_lambda_dir = "${path.module}/lambda/ingestion" + ingestion_lambda_archive = "${local.ingestion_lambda_dir}/ingestion.zip" + migration_lambda_dir = "${path.module}/lambda/migration" + migration_lambda_archive = "${local.migration_lambda_dir}/migration.zip" + s3_bucket_arn = "arn:aws:s3:::${var.s3_bucket_name}" + tags = var.tags +} diff --git a/infrastructure/ingestion/aws/outputs.tf b/infrastructure/ingestion/aws/outputs.tf new file mode 100644 index 0000000..185e483 --- /dev/null +++ b/infrastructure/ingestion/aws/outputs.tf @@ -0,0 +1,3 @@ +output "rds_pg_endpoint" { + value = aws_db_instance.rds_postgres.endpoint +} diff --git a/infrastructure/ingestion/aws/providers.tf b/infrastructure/ingestion/aws/providers.tf new file mode 100644 index 0000000..d3d1e83 --- /dev/null +++ b/infrastructure/ingestion/aws/providers.tf @@ -0,0 +1,23 @@ +terraform { + required_version = ">=1.3" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } + + backend "s3" { + encrypt = true + } +} + +provider "aws" { + region = var.aws_region + profile = var.aws_profile + + default_tags { + tags = local.tags + } +} diff --git a/infrastructure/ingestion/aws/rds.tf b/infrastructure/ingestion/aws/rds.tf new file mode 100644 index 0000000..a133b4f --- /dev/null +++ b/infrastructure/ingestion/aws/rds.tf @@ -0,0 +1,18 @@ +resource "aws_db_instance" "rds_postgres" { + identifier = "${var.project_name}-rds-postgres" + allocated_storage = 10 + engine = "postgres" + engine_version = "16.3" + instance_class = "db.t3.micro" # Smallest instance type for PostgreSQL + username = var.rds_username + password = random_password.rds_master_password.result + parameter_group_name = "default.postgres16" + db_name = var.rds_db_name + skip_final_snapshot = true + publicly_accessible = false + storage_encrypted = true + deletion_protection = true + backup_retention_period = 7 + vpc_security_group_ids = [aws_security_group.rds_sg.id] + db_subnet_group_name = var.db_subnet_group_name +} diff --git a/infrastructure/ingestion/aws/s3.tfbackend b/infrastructure/ingestion/aws/s3.tfbackend new file mode 100644 index 0000000..6fa4016 --- /dev/null +++ b/infrastructure/ingestion/aws/s3.tfbackend @@ -0,0 +1,5 @@ +bucket = "" +key = "" +region = "" +dynamodb_table = "" +profile = "" diff --git a/infrastructure/ingestion/aws/secrets.tf b/infrastructure/ingestion/aws/secrets.tf new file mode 100644 index 0000000..e43778b --- /dev/null +++ b/infrastructure/ingestion/aws/secrets.tf @@ -0,0 +1,19 @@ +resource "random_password" "rds_master_password" { 
+ length = 40 + special = true + min_special = 5 + override_special = "!#$%^&*()-_=+[]{}<>:?" + keepers = { + pass_version = 1 + } +} + +resource "aws_secretsmanager_secret" "rds_master_password" { + name = "${var.project_name}-rds-master-password" + description = "Master password for RDS instance" +} + +resource "aws_secretsmanager_secret_version" "rds_master_password" { + secret_id = aws_secretsmanager_secret.rds_master_password.id + secret_string = random_password.rds_master_password.result +} diff --git a/infrastructure/ingestion/aws/securitygroups.tf b/infrastructure/ingestion/aws/securitygroups.tf new file mode 100644 index 0000000..9e23393 --- /dev/null +++ b/infrastructure/ingestion/aws/securitygroups.tf @@ -0,0 +1,33 @@ +resource "aws_security_group" "rds_sg" { + name = "${var.project_name}-rds-sg" + description = "Security group for RDS instance" + vpc_id = data.aws_vpc.selected.id + + ingress { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + cidr_blocks = [data.aws_vpc.selected.cidr_block] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + + +resource "aws_security_group" "lambda_sg" { + name = "${var.project_name}-lambda-sg" + description = "Security group for Lambda functions" + vpc_id = data.aws_vpc.selected.id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} diff --git a/infrastructure/ingestion/aws/sfn.tf b/infrastructure/ingestion/aws/sfn.tf new file mode 100644 index 0000000..540f09e --- /dev/null +++ b/infrastructure/ingestion/aws/sfn.tf @@ -0,0 +1,13 @@ +resource "aws_sfn_state_machine" "ingestion-step-function" { + name = "${var.project_name}-ingestion-step-function" + role_arn = aws_iam_role.sfn_role.arn + definition = templatefile("${local.configuration_dir}/ingestion_sfn_definition.json", { + migrate_lambda_arn = "${aws_lambda_function.migration-lambda.arn}", + ingestion_lambda_arn = "${aws_lambda_function.ingestion-lambda.arn}" + }) + + depends_on = [ + aws_iam_role.sfn_role, + aws_iam_role_policy.sfn_policy, + ] +} diff --git a/infrastructure/ingestion/aws/sts.tf b/infrastructure/ingestion/aws/sts.tf new file mode 100644 index 0000000..8fc4b38 --- /dev/null +++ b/infrastructure/ingestion/aws/sts.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/infrastructure/ingestion/aws/terraform.tfvars.example b/infrastructure/ingestion/aws/terraform.tfvars.example new file mode 100644 index 0000000..85c8a35 --- /dev/null +++ b/infrastructure/ingestion/aws/terraform.tfvars.example @@ -0,0 +1,17 @@ +aws_region = "" +aws_profile = "" +environment_type = "" +project_name = "" +vpc_name = "" +subnet_name = "" +db_subnet_group_name = "" +permissions_boundary_arn = "" +s3_bucket_name = "" +tags = { + "mytag" = "tag" + "mytag2" = "tag2" +} +rds_username = "" +rds_db_name = "" +ingestion_schedule = "" +disable_ingestion_schedule = "" diff --git a/infrastructure/ingestion/aws/variables.tf b/infrastructure/ingestion/aws/variables.tf new file mode 100644 index 0000000..5b1b594 --- /dev/null +++ b/infrastructure/ingestion/aws/variables.tf @@ -0,0 +1,115 @@ +variable "aws_region" { + type = string + description = "AWS region where to deploy resources" + + validation { + condition = can(regex("^(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\\d+$", var.aws_region)) + error_message = "You should enter a valid AWS region 
(https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.RegionsAndAvailabilityZones.html)" + } +} + +variable "aws_profile" { + type = string + description = "AWS profile to use for authentication" +} + +variable "environment_type" { + type = string + description = "Environment type" + + validation { + condition = contains(["PRODUCTION", "PRE-PRODUCTION", "QUALITY ASSURANCE", "INTEGRATION TESTING", "DEVELOPMENT", "LAB"], var.environment_type) + error_message = "The environment type should be one of the following values: PRODUCTION, PRE-PRODUCTION, QUALITY ASSURANCE, INTEGRATION TESTING, DEVELOPMENT, LAB (case sensitive)" + } +} + +variable "vpc_name" { + type = string + description = "Identifier of the VPC to use for secrets-finder" +} + +variable "subnet_name" { + type = string + description = "Name of the subnet where to deploy the resources (wildcards are allowed: first match is used)" +} + +variable "db_subnet_group_name" { + type = string + description = "Name of the RDS subnet group" +} + +variable "tags" { + type = map(string) + description = "A map of tags to add to the resources" + + validation { + condition = alltrue([for v in values(var.tags) : v != ""]) + error_message = "Tag values must not be empty." + } +} + +variable "project_name" { + type = string + description = "Name of the project" + default = "secrets-finder" +} + +variable "permissions_boundary_arn" { + type = string + description = "ARN of the permissions boundary to use for the IAM role" + + validation { + condition = can(regex("^arn:aws:iam::[0-9]{12}:policy\\/([a-zA-Z0-9-_.]+)$", var.permissions_boundary_arn)) + error_message = "The provided ARN is not a valid ARN for a policy" + } +} + +variable "s3_bucket_name" { + type = string + description = "Name of the S3 bucket to create" + + validation { + condition = can(regex("^[a-z0-9.-]{3,63}$", var.s3_bucket_name)) + error_message = "The S3 bucket name must be a valid string with only a-z0-9.- characters and have a length between 3 and 63" + } +} + +variable "rds_username" { + type = string + description = "Username for the RDS instance" + default = "secrets_finder" + + validation { + condition = can(regex("^[a-z][a-z0-9_]{1,}$", var.rds_username)) + error_message = "The RDS username must be a valid string with only a-z0-9_ characters, have a length greater than 1, and not start with a number" + } +} + + +variable "rds_db_name" { + type = string + description = "Name of the database to create in the RDS instance" + default = "secrets_finder" + + validation { + condition = can(regex("^[a-z][a-z0-9_]{1,}$", var.rds_db_name)) + error_message = "The RDS database name must be a valid string with only a-z0-9_ characters, have a length greater than 1, and not start with a number" + } +} + +variable "ingestion_schedule" { + type = string + description = "Cron schedule for the CloudWatch Event Rule" + default = "rate(24 hours)" + + validation { + condition = can(regex("^(rate|cron)\\(\\d+ (minutes|hours|days)\\)$", var.ingestion_schedule)) + error_message = "The ingestion schedule should be in the format 'rate(n minutes|hours|days)' or 'cron(expression)', where n is a positive integer" + } +} + +variable "disable_ingestion_schedule" { + type = bool + description = "Disable the ingestion schedule" + default = false +} diff --git a/infrastructure/ingestion/aws/vpc.tf b/infrastructure/ingestion/aws/vpc.tf new file mode 100644 index 0000000..041279b --- /dev/null +++ b/infrastructure/ingestion/aws/vpc.tf @@ -0,0 +1,22 @@ +data "aws_vpc" "selected" { + filter { + name = 
"tag:Name" + values = [var.vpc_name] + } +} + +data "aws_subnets" "default" { + filter { + name = "tag:Name" + values = [var.subnet_name] + } +} + +data "aws_subnet" "selected" { + id = element(sort(data.aws_subnets.default.ids), 0) +} + +data "aws_security_group" "default" { + vpc_id = data.aws_vpc.selected.id + name = "default" +} diff --git a/migrations/README.md b/migrations/README.md new file mode 100644 index 0000000..d97f2bf --- /dev/null +++ b/migrations/README.md @@ -0,0 +1,51 @@ +# Migrations + +This directory contains alembic configuration and database versions to manage database migrations. Migrations are compatible with SQLite, Postgres & MariaDB. + +## Files + +- `alembic.ini`: Contains alembic configuration. + +- `migrate.py`: A wrapper script to programmatically run migrations. Example: using Lambdas. + +- `pyproject.toml`: This is the configuration file for the `poetry` package manager, which is used to manage dependencies for the script. + +- `db`: This directory contains alembic env config and versions. + +## Usage + +1. To run, setup the DB_URL environment variables. DB_URL environment variable value should be FQDN. Example: `postgresql://myuser:mypassword@127.0.0.1:5432/mydatabase` +2. Setup the poetry environment + ```bash + poetry install + ``` +3. Run migration + ```bash + poetry run alembic upgrade head + ``` + +## Creating New Revisions + +New migration revisions are needed whenever there are modifications to the database schema, such as adding a new table, adding a new column, or updating an existing column. + +1. Setup the poetry environment, if not already done + ```bash + poetry install + ``` +2. Run command. Provide a comment for the revision + ```bash + poetry run alembic revision -m "" + ``` + This command will create a new revision file under `db/revisions`. Complete definition for `upgrade` and `downgrade` function. + +## Requirements + +- Python 3.9 or higher +- `poetry` package manager +- Connectivity to the database + > By default the database created will be private, you may have to run this script from a compute resource that is authorized to connect + +## Dependencies + +- alembic = "^1.13.1" +- psycopg2-binary = "^2.9.9" diff --git a/migrations/alembic.ini b/migrations/alembic.ini new file mode 100644 index 0000000..f40a6e1 --- /dev/null +++ b/migrations/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = db + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. 
+# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to db/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:db/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = %(DB_URL)s + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/migrations/db/env.py b/migrations/db/env.py new file mode 100644 index 0000000..dfd6787 --- /dev/null +++ b/migrations/db/env.py @@ -0,0 +1,79 @@ +from logging.config import fileConfig +import os + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +config.set_main_option("DB_URL", os.environ["DB_URL"]) + +# Interpret the config file for Python logging. +# This line sets up loggers basically. 
+if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = None + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure(connection=connection, target_metadata=target_metadata) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/migrations/db/script.py.mako b/migrations/db/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/migrations/db/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/migrations/db/versions/9fab027609bc_create_scans_table.py b/migrations/db/versions/9fab027609bc_create_scans_table.py new file mode 100644 index 0000000..18f78b7 --- /dev/null +++ b/migrations/db/versions/9fab027609bc_create_scans_table.py @@ -0,0 +1,42 @@ +"""Create scans table + +Revision ID: 9fab027609bc +Revises: ac05203c65bd +Create Date: 2024-06-17 20:29:24.694581 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = "9fab027609bc" +down_revision: Union[str, None] = "ac05203c65bd" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "scans", + sa.Column("uuid", sa.String(), primary_key=True), + sa.Column("job_uuid", sa.String(), nullable=False), + sa.Column("scan_identifier", sa.String(), nullable=False), + sa.Column("scm", sa.String(), nullable=False), + sa.Column("organization", sa.String(), nullable=True), + sa.Column("repository", sa.String(), nullable=False), + sa.Column("scan_context", sa.String(), nullable=False), + sa.Column("started_on", sa.DateTime(), nullable=False), + sa.Column("completed_on", sa.DateTime(), nullable=False), + sa.Column("status", sa.String(), nullable=False), + sa.Column("scan_mode", sa.String(), nullable=True), + sa.Column("scan_type", sa.String(), nullable=False), + sa.Column("metadata", sa.JSON(), nullable=True), + ) + + +def downgrade() -> None: + op.drop_table("scans") diff --git a/migrations/db/versions/ac05203c65bd_create_findings_table.py b/migrations/db/versions/ac05203c65bd_create_findings_table.py new file mode 100644 index 0000000..2d29cde --- /dev/null +++ b/migrations/db/versions/ac05203c65bd_create_findings_table.py @@ -0,0 +1,53 @@ +"""Create findings/secrets table + +Revision ID: ac05203c65bd +Revises: +Create Date: 2024-06-17 18:59:55.247810 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = "ac05203c65bd" +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "findings", + sa.Column("uuid", sa.String(), primary_key=True), + sa.Column("scan_uuid", sa.String(), nullable=False), + sa.Column("job_uuid", sa.String(), nullable=False), + sa.Column("organization", sa.String(), nullable=True), + sa.Column("scan_context", sa.String(), nullable=False), + sa.Column("created_on", sa.DateTime(), nullable=False), + sa.Column("decoder_name", sa.String(), nullable=False), + sa.Column("detector_name", sa.String(), nullable=False), + sa.Column("detector_type", sa.Integer(), nullable=False), + sa.Column("raw", sa.VARCHAR(), nullable=False), + sa.Column("raw_v2", sa.VARCHAR(), nullable=True), + sa.Column("redacted", sa.String(), nullable=True), + sa.Column("source_name", sa.String(), nullable=False), + sa.Column("source_type", sa.Integer(), nullable=False), + sa.Column("verified", sa.Boolean(), nullable=False), + sa.Column("extra_data", sa.JSON(), nullable=True), + sa.Column("repository", sa.String(), nullable=True), + sa.Column("filename", sa.String(), nullable=False), + sa.Column("commit_hash", sa.String(), nullable=True), + sa.Column("committer_email", sa.String(), nullable=True), + sa.Column("commit_timestamp", sa.DateTime(), nullable=True), + sa.Column("line_number", sa.Integer(), nullable=False), + sa.Column("is_still_valid", sa.Boolean(), nullable=False), + sa.Column("last_validated_on", sa.DateTime(), nullable=False), + ) + + +def downgrade() -> None: + op.drop_table("findings") diff --git a/migrations/db/versions/db4e0564e6df_create_jobs_table.py b/migrations/db/versions/db4e0564e6df_create_jobs_table.py new file mode 100644 index 0000000..7b18ff4 --- /dev/null +++ b/migrations/db/versions/db4e0564e6df_create_jobs_table.py @@ -0,0 +1,38 @@ +"""create jobs table + +Revision ID: db4e0564e6df +Revises: 
9fab027609bc +Create Date: 2024-06-22 13:09:36.346009 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = "db4e0564e6df" +down_revision: Union[str, None] = "9fab027609bc" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "jobs", + sa.Column("uuid", sa.String(), primary_key=True), + sa.Column("scan_identifier", sa.String(), nullable=False), + sa.Column("scm", sa.String(), nullable=False), + sa.Column("scan_context", sa.String(), nullable=False), + sa.Column("started_on", sa.DateTime(), nullable=False), + sa.Column("completed_on", sa.DateTime(), nullable=False), + sa.Column("status", sa.String(), nullable=False), + sa.Column("scan_mode", sa.String(), nullable=True), + sa.Column("scan_type", sa.String(), nullable=False), + ) + + +def downgrade() -> None: + op.drop_table("jobs") diff --git a/migrations/migrate.py b/migrations/migrate.py new file mode 100644 index 0000000..0e71f46 --- /dev/null +++ b/migrations/migrate.py @@ -0,0 +1,24 @@ +import logging +from alembic.config import Config +from alembic import command + + +def migrate(event, context): + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], + ) + + logging.info("Starting Alembic upgrade") + # Set up the Alembic configuration + alembic_cfg = Config("alembic.ini") + + # Run the Alembic upgrade command + command.upgrade(alembic_cfg, "head") + + return {"statusCode": 200, "body": "Alembic upgrade successful!"} + + +if __name__ == "__main__": + migrate({}, {}) diff --git a/migrations/pyproject.toml b/migrations/pyproject.toml new file mode 100644 index 0000000..1b641e7 --- /dev/null +++ b/migrations/pyproject.toml @@ -0,0 +1,20 @@ +[tool.poetry] +name = "migrate" +version = "0.1.0" +description = "DB configuration and migration scripts" +authors = ["Thomson Reuters "] +license = "MIT" +readme = "README.md" +include = ["alembic.ini", "db/*", "db/versions/*"] + +[tool.poetry.dependencies] +python = "^3.9" +alembic = "^1.13.1" +psycopg2-binary = "^2.9.9" + +[tool.poetry-plugin-lambda-build] +package_artifact_path = "migration.zip" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api"
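
For reference, the "Creating New Revisions" workflow in `migrations/README.md` ends with completing the `upgrade` and `downgrade` functions of the generated file. Below is a minimal sketch of what such a completed revision could look like. The revision id `0a1b2c3d4e5f` and the `ix_findings_scan_uuid` index are hypothetical examples introduced only for illustration; the `down_revision` value points at the current head shipped in this change (`db4e0564e6df`, the jobs table).

```python
"""Add index on findings.scan_uuid (illustrative sketch only)

Revision ID: 0a1b2c3d4e5f
Revises: db4e0564e6df
Create Date: 2024-06-27 00:00:00.000000

"""

from typing import Sequence, Union

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "0a1b2c3d4e5f"  # hypothetical revision id
down_revision: Union[str, None] = "db4e0564e6df"  # current head: create jobs table
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # Non-unique index to speed up lookups of findings by their parent scan.
    op.create_index("ix_findings_scan_uuid", "findings", ["scan_uuid"])


def downgrade() -> None:
    # Reverse the upgrade so `alembic downgrade -1` restores the previous schema.
    op.drop_index("ix_findings_scan_uuid", table_name="findings")
```

Running `poetry run alembic upgrade head` from the `migrations` directory (or invoking `migrate.py` via the migration Lambda) would then apply such a revision after the three shipped in this change.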