diff --git a/.gitignore b/.gitignore
index 4c68d1b..e56864c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -254,3 +254,9 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
+
+# Databases
+*.db
+
+# Docker
+volumes/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6aa6e7c..36e7674 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,3 +29,12 @@ repos:
entry: bash -c 'docker run --rm -v "$(pwd):/workdir" -i --rm trufflesecurity/trufflehog:latest git file:///workdir --since-commit HEAD --only-verified --fail'
language: system
stages: ["commit", "push"]
+ - repo: https://github.com/hadolint/hadolint
+ rev: v2.10.0
+ hooks:
+ - id: hadolint-docker
+ name: Lint Dockerfiles
+ description: Runs hadolint Docker image to lint Dockerfiles
+ language: docker_image
+ types: ["dockerfile"]
+ entry: ghcr.io/hadolint/hadolint hadolint
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 0000000..e886712
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,48 @@
+services:
+ init:
+ image: python:3.9
+ container_name: init
+ volumes:
+ - ./migrations:/migrations:ro
+ environment:
+ - DB_URL=postgresql://myuser:mypassword@postgres/mydatabase
+ command:
+ - sh
+ - -c
+ - |
+ cd /migrations
+ pip install poetry
+ poetry lock --no-update
+ poetry install
+ poetry run python migrate.py
+ depends_on:
+ postgres:
+ condition: service_healthy
+
+ postgres:
+ image: postgres:latest
+ container_name: postgres
+ environment:
+ POSTGRES_USER: myuser
+ POSTGRES_PASSWORD: mypassword
+ POSTGRES_DB: mydatabase
+ ports:
+ - "127.0.0.1:5432:5432"
+ volumes:
+ - ./volumes/postgres:/var/lib/postgresql/data
+ healthcheck:
+ test: ["CMD-SHELL", "pg_isready -U myuser -d mydatabase"]
+ interval: 10s
+ timeout: 5s
+ retries: 3
+
+ adminer:
+ image: adminer:latest
+ container_name: adminer
+ ports:
+ - "127.0.0.1:8080:8080"
+ depends_on:
+ init:
+ condition: service_completed_successfully
+ postgres:
+ condition: service_started
diff --git a/infrastructure/ingestion/aws/README.md b/infrastructure/ingestion/aws/README.md
new file mode 100644
index 0000000..14f611a
--- /dev/null
+++ b/infrastructure/ingestion/aws/README.md
@@ -0,0 +1,86 @@
+# infrastructure
+
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| [terraform](#requirement\_terraform) | >=1.3 |
+| [aws](#requirement\_aws) | ~> 5.0 |
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| [aws](#provider\_aws) | ~> 5.0 |
+| [local](#provider\_local) | n/a |
+| [null](#provider\_null) | n/a |
+| [random](#provider\_random) | n/a |
+
+## Modules
+
+No modules.
+
+## Resources
+
+| Name | Type |
+|------|------|
+| [aws_cloudwatch_event_rule.ingestion_sfn_trigger_rule](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource |
+| [aws_cloudwatch_event_target.ingestion_sfn_trigger](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource |
+| [aws_db_instance.rds_postgres](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_instance) | resource |
+| [aws_iam_policy.policy_for_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
+| [aws_iam_role.cloudwatch_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
+| [aws_iam_role.lambda_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
+| [aws_iam_role.sfn_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
+| [aws_iam_role_policy.cloudwatch_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
+| [aws_iam_role_policy.sfn_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
+| [aws_iam_role_policy_attachment.LambdaExecutionRolePolicyAttachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
+| [aws_lambda_function.ingestion-lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
+| [aws_lambda_function.migration-lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
+| [aws_secretsmanager_secret.rds_master_password](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource |
+| [aws_secretsmanager_secret_version.rds_master_password](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource |
+| [aws_security_group.lambda_sg](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource |
+| [aws_security_group.rds_sg](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource |
+| [aws_sfn_state_machine.ingestion-step-function](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sfn_state_machine) | resource |
+| [null_resource.ingestion_lambda_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
+| [null_resource.migration_lambda_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
+| [random_password.rds_master_password](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource |
+| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
+| [aws_iam_policy_document.cloudwatch_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.cloudwatch_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.lambda_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.permissions_for_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.sf_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_iam_policy_document.sfn_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
+| [aws_security_group.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_group) | data source |
+| [aws_subnet.selected](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/subnet) | data source |
+| [aws_subnets.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/subnets) | data source |
+| [aws_vpc.selected](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/vpc) | data source |
+| [local_file.ingestion_lambda_build](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source |
+| [local_file.migration_lambda_build](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [aws\_profile](#input\_aws\_profile) | AWS profile to use for authentication | `string` | n/a | yes |
+| [aws\_region](#input\_aws\_region) | AWS region where to deploy resources | `string` | n/a | yes |
+| [db\_subnet\_group\_name](#input\_db\_subnet\_group\_name) | Name of the RDS subnet group | `string` | n/a | yes |
+| [disable\_ingestion\_schedule](#input\_disable\_ingestion\_schedule) | Disable the ingestion schedule | `bool` | `false` | no |
+| [environment\_type](#input\_environment\_type) | Environment type | `string` | n/a | yes |
+| [ingestion\_schedule](#input\_ingestion\_schedule) | Cron schedule for the CloudWatch Event Rule | `string` | `"rate(24 hours)"` | no |
+| [permissions\_boundary\_arn](#input\_permissions\_boundary\_arn) | ARN of the permissions boundary to use for the IAM role | `string` | n/a | yes |
+| [project\_name](#input\_project\_name) | Name of the project | `string` | `"secrets-finder"` | no |
+| [rds\_db\_name](#input\_rds\_db\_name) | Name of the database to create in the RDS instance | `string` | `"secrets_finder"` | no |
+| [rds\_username](#input\_rds\_username) | Username for the RDS instance | `string` | `"secrets_finder"` | no |
+| [s3\_bucket\_name](#input\_s3\_bucket\_name) | Name of the S3 bucket to create | `string` | n/a | yes |
+| [subnet\_name](#input\_subnet\_name) | Name of the subnet where to deploy the resources (wildcards are allowed: first match is used) | `string` | n/a | yes |
+| [tags](#input\_tags) | A map of tags to add to the resources | `map(string)` | n/a | yes |
+| [vpc\_name](#input\_vpc\_name) | Identifier of the VPC to use for secrets-finder | `string` | n/a | yes |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [rds\_pg\_endpoint](#output\_rds\_pg\_endpoint) | n/a |
+
diff --git a/infrastructure/ingestion/aws/cloudwatch.tf b/infrastructure/ingestion/aws/cloudwatch.tf
new file mode 100644
index 0000000..90571d3
--- /dev/null
+++ b/infrastructure/ingestion/aws/cloudwatch.tf
@@ -0,0 +1,17 @@
+resource "aws_cloudwatch_event_rule" "ingestion_sfn_trigger_rule" {
+ name = "${var.project_name}-ingestion-sfn-trigger"
+ description = "Triggers the Step function on schedule"
+ schedule_expression = var.ingestion_schedule
+ state = var.disable_ingestion_schedule ? "DISABLED" : "ENABLED"
+}
+
+resource "aws_cloudwatch_event_target" "ingestion_sfn_trigger" {
+ rule = aws_cloudwatch_event_rule.ingestion_sfn_trigger_rule.name
+ arn = aws_sfn_state_machine.ingestion-step-function.arn
+ role_arn = aws_iam_role.cloudwatch_role.arn
+
+ depends_on = [
+ aws_iam_role.cloudwatch_role,
+ aws_iam_role_policy.cloudwatch_policy,
+ ]
+}
diff --git a/infrastructure/ingestion/aws/configuration/ingestion_sfn_definition.json b/infrastructure/ingestion/aws/configuration/ingestion_sfn_definition.json
new file mode 100644
index 0000000..db2f5fd
--- /dev/null
+++ b/infrastructure/ingestion/aws/configuration/ingestion_sfn_definition.json
@@ -0,0 +1,95 @@
+{
+ "Comment": "Ingestion State Machine",
+ "StartAt": "BootStrapState",
+ "States": {
+ "BootStrapState": {
+ "Type": "Task",
+ "Resource": "${migrate_lambda_arn}",
+ "Next": "IngestionState"
+ },
+ "IngestionState": {
+ "Type": "Parallel",
+ "Branches": [
+ {
+ "Comment": "Ingest Scheduled Scan Findings",
+ "StartAt": "ListScheduledScanFindingsFiles",
+ "States": {
+ "ListScheduledScanFindingsFiles": {
+ "Type": "Task",
+ "Resource": "${ingestion_lambda_arn}",
+ "ResultPath": "$.lambdaResult",
+ "Parameters": {
+ "action": "list_files",
+ "prefix": "secrets-finder/scheduled-scans/results/"
+ },
+ "Next": "IngestScheduledScanFindingsFiles"
+ },
+ "IngestScheduledScanFindingsFiles": {
+ "Type": "Map",
+ "ItemsPath": "$.lambdaResult.body.files",
+ "Parameters": {
+ "index.$": "$$.Map.Item.Index",
+ "key.$": "$$.Map.Item.Value"
+ },
+ "Iterator": {
+ "StartAt": "IngestScheduledScanFindings",
+ "States": {
+ "IngestScheduledScanFindings": {
+ "Type": "Task",
+ "Resource": "${ingestion_lambda_arn}",
+ "Parameters": {
+ "action": "ingest_findings",
+ "file_key.$": "$.key"
+ },
+ "End": true
+ }
+ }
+ },
+ "End": true
+ }
+ }
+ },
+ {
+ "Comment": "Ingest Ongoing Scan Findings",
+ "StartAt": "ListOngoingScanFindingsFiles",
+ "States": {
+ "ListOngoingScanFindingsFiles": {
+ "Type": "Task",
+ "Resource": "${ingestion_lambda_arn}",
+ "ResultPath": "$.lambdaResult",
+ "Parameters": {
+ "action": "list_files",
+ "prefix": "secrets-finder/ongoing-scans/results/"
+ },
+ "Next": "IngestOngoingScanFindingsFiles"
+ },
+ "IngestOngoingScanFindingsFiles": {
+ "Type": "Map",
+ "ItemsPath": "$.lambdaResult.body.files",
+ "Parameters": {
+ "index.$": "$$.Map.Item.Index",
+ "key.$": "$$.Map.Item.Value"
+ },
+ "Iterator": {
+ "StartAt": "IngestOngoingScanFindings",
+ "States": {
+ "IngestOngoingScanFindings": {
+ "Type": "Task",
+ "Resource": "${ingestion_lambda_arn}",
+ "Parameters": {
+ "action": "ingest_findings",
+ "file_key.$": "$.key"
+ },
+ "End": true
+ }
+ }
+ },
+ "End": true
+ }
+ }
+ }
+ ],
+ "End": true
+ }
+ }
+}
diff --git a/infrastructure/ingestion/aws/iam.tf b/infrastructure/ingestion/aws/iam.tf
new file mode 100644
index 0000000..15dbf9c
--- /dev/null
+++ b/infrastructure/ingestion/aws/iam.tf
@@ -0,0 +1,160 @@
+# Lambda execution role
+data "aws_iam_policy_document" "lambda_assume_role" {
+ statement {
+ effect = "Allow"
+ principals {
+ identifiers = ["lambda.amazonaws.com"]
+ type = "Service"
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+resource "aws_iam_role" "lambda_execution_role" {
+ name = "${var.project_name}-ingestion-lambda-execution-role"
+ assume_role_policy = data.aws_iam_policy_document.lambda_assume_role.json
+ path = "/"
+ permissions_boundary = var.permissions_boundary_arn
+}
+
+data "aws_iam_policy_document" "permissions_for_execution_role" {
+ statement {
+ sid = "WriteToCloudWatchLogGroup"
+ effect = "Allow"
+ actions = [
+ "logs:CreateLogStream",
+ "logs:PutLogEvents",
+ ]
+ resources = ["arn:aws:logs:*:*:*"]
+ }
+
+ statement {
+ sid = "AllowAccessToBucket"
+ effect = "Allow"
+ actions = [
+ "s3:ListBucket",
+ "s3:GetObject",
+ "s3:DeleteObject"
+ ]
+ resources = [
+ "${local.s3_bucket_arn}",
+ "${local.s3_bucket_arn}/*"
+ ]
+ }
+
+ statement {
+ sid = "AllowAccessToRDS"
+ effect = "Allow"
+ actions = [
+ "rds-data:ExecuteStatement",
+ "rds-data:BatchExecuteStatement",
+ "rds-data:BeginTransaction",
+ "rds-data:CommitTransaction",
+ "rds-data:RollbackTransaction"
+ ]
+ resources = [
+ aws_db_instance.rds_postgres.arn
+ ]
+ }
+
+ statement {
+ sid = "AllowEC2Perms"
+ effect = "Allow"
+ actions = [
+ "ec2:DescribeNetworkInterfaces",
+ "ec2:CreateNetworkInterface",
+ "ec2:DeleteNetworkInterface",
+ "ec2:DescribeInstances",
+ "ec2:AttachNetworkInterface"
+ ]
+ resources = ["*"]
+ }
+}
+
+resource "aws_iam_policy" "policy_for_execution_role" {
+ name = "${var.project_name}-ingestion-lambda-execution-role-permissions"
+ description = "Policy granting necessary permissions to Lambda execution instance"
+ policy = data.aws_iam_policy_document.permissions_for_execution_role.json
+}
+
+resource "aws_iam_role_policy_attachment" "LambdaExecutionRolePolicyAttachment" {
+ policy_arn = aws_iam_policy.policy_for_execution_role.arn
+ role = aws_iam_role.lambda_execution_role.name
+}
+
+# Step function role
+
+data "aws_iam_policy_document" "sf_assume_role" {
+ statement {
+ effect = "Allow"
+ principals {
+ identifiers = ["states.amazonaws.com"]
+ type = "Service"
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+resource "aws_iam_role" "sfn_role" {
+ name = "${var.project_name}-ingestion-sf-execution-role"
+ path = "/"
+ permissions_boundary = var.permissions_boundary_arn
+ assume_role_policy = data.aws_iam_policy_document.sf_assume_role.json
+}
+
+data "aws_iam_policy_document" "sfn_policy_document" {
+ statement {
+ effect = "Allow"
+ actions = [
+ "lambda:InvokeFunction"
+ ]
+ resources = [
+ aws_lambda_function.ingestion-lambda.arn,
+ aws_lambda_function.migration-lambda.arn
+ ]
+ }
+}
+
+resource "aws_iam_role_policy" "sfn_policy" {
+ name = "${var.project_name}-ingestion-sf-execution-policy"
+ role = aws_iam_role.sfn_role.id
+ policy = data.aws_iam_policy_document.sfn_policy_document.json
+}
+
+# Cloudwatch role
+
+data "aws_iam_policy_document" "cloudwatch_assume_role" {
+ statement {
+ effect = "Allow"
+ principals {
+ identifiers = ["events.amazonaws.com"]
+ type = "Service"
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+resource "aws_iam_role" "cloudwatch_role" {
+ name = "${var.project_name}-ingestion-cloud-watch-role"
+ path = "/"
+ permissions_boundary = var.permissions_boundary_arn
+ assume_role_policy = data.aws_iam_policy_document.cloudwatch_assume_role.json
+}
+
+data "aws_iam_policy_document" "cloudwatch_policy_document" {
+ statement {
+ effect = "Allow"
+ actions = [
+ "states:StartExecution"
+ ]
+ resources = [
+ aws_sfn_state_machine.ingestion-step-function.arn
+ ]
+ }
+}
+
+resource "aws_iam_role_policy" "cloudwatch_policy" {
+ name = "${var.project_name}-cloudwatch-event-policy"
+ role = aws_iam_role.cloudwatch_role.id
+ policy = data.aws_iam_policy_document.cloudwatch_policy_document.json
+}
diff --git a/infrastructure/ingestion/aws/lambda.tf b/infrastructure/ingestion/aws/lambda.tf
new file mode 100644
index 0000000..c7d7b03
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda.tf
@@ -0,0 +1,100 @@
+resource "null_resource" "ingestion_lambda_build" {
+ provisioner "local-exec" {
+ command = "./package.sh"
+ working_dir = "${local.ingestion_lambda_dir}/"
+ }
+
+ triggers = {
+ always_run = timestamp()
+ }
+}
+
+data "local_file" "ingestion_lambda_build" {
+ filename = local.ingestion_lambda_archive
+ depends_on = [null_resource.ingestion_lambda_build]
+}
+
+resource "aws_lambda_function" "ingestion-lambda" {
+ function_name = "${var.project_name}-ingestion-lambda"
+ role = aws_iam_role.lambda_execution_role.arn
+ architectures = ["arm64"]
+ runtime = "python3.9"
+ handler = "ingestion.handler"
+ timeout = 900 # 15 minutes
+ memory_size = 512 # 512 MB
+ filename = local.ingestion_lambda_archive
+ source_code_hash = data.local_file.ingestion_lambda_build.content_sha256
+
+ vpc_config {
+ subnet_ids = [data.aws_subnet.selected.id]
+ security_group_ids = [aws_security_group.lambda_sg.id]
+ }
+
+ ephemeral_storage {
+ size = 1024 # 1 GB
+ }
+
+ environment {
+ variables = {
+ BUCKET_NAME = var.s3_bucket_name
+ DB_URL = local.db_url
+ }
+ }
+
+ depends_on = [
+ data.local_file.ingestion_lambda_build,
+ aws_iam_role.lambda_execution_role,
+ aws_iam_policy.policy_for_execution_role,
+ aws_iam_role_policy_attachment.LambdaExecutionRolePolicyAttachment
+ ]
+}
+
+resource "null_resource" "migration_lambda_build" {
+ provisioner "local-exec" {
+ command = "./package.sh"
+ working_dir = "${local.migration_lambda_dir}/"
+ }
+
+ triggers = {
+ always_run = timestamp()
+ }
+}
+
+data "local_file" "migration_lambda_build" {
+ filename = local.migration_lambda_archive
+ depends_on = [null_resource.migration_lambda_build]
+}
+
+resource "aws_lambda_function" "migration-lambda" {
+ function_name = "${var.project_name}-migration-lambda"
+ role = aws_iam_role.lambda_execution_role.arn
+ architectures = ["arm64"]
+ runtime = "python3.9"
+ handler = "migrate.migrate"
+ timeout = 60 # 1 minute
+ memory_size = 512 # 512 MB
+ filename = local.migration_lambda_archive
+ source_code_hash = data.local_file.migration_lambda_build.content_sha256
+
+ vpc_config {
+ subnet_ids = [data.aws_subnet.selected.id]
+ security_group_ids = [aws_security_group.lambda_sg.id]
+ }
+
+ ephemeral_storage {
+ size = 512 # 512 MB
+ }
+
+ environment {
+ variables = {
+ DB_URL = local.db_url
+ }
+ }
+
+ depends_on = [
+ data.local_file.migration_lambda_build,
+ aws_iam_role.lambda_execution_role,
+ aws_iam_policy.policy_for_execution_role,
+ aws_iam_role_policy_attachment.LambdaExecutionRolePolicyAttachment
+ ]
+}
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/Dockerfile b/infrastructure/ingestion/aws/lambda/ingestion/Dockerfile
new file mode 100644
index 0000000..b81d74f
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/Dockerfile
@@ -0,0 +1,12 @@
+FROM python@sha256:320a7a4250aba4249f458872adecf92eea88dc6abd2d76dc5c0f01cac9b53990
+
+RUN pip install poetry==1.8.3 --no-cache-dir
+
+WORKDIR /app
+
+COPY . /app/
+
+RUN poetry self add poetry-plugin-lambda-build \
+ && poetry self add poetry-plugin-export \
+ && poetry lock --no-update \
+ && poetry build-lambda
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/README.md b/infrastructure/ingestion/aws/lambda/ingestion/README.md
new file mode 100644
index 0000000..2a8579c
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/README.md
@@ -0,0 +1,37 @@
+# Ingestion
+
+This directory contains data ingestion lambda. The Lambda is invoked by a Step Function.
+
+The packaging process uses the Poetry Lambda plugin and Docker to generate Lambda packages for the correct platform. This is automated when applying Terraform.
+
+Lambda takes a set of actions as input. Each action performs a specific function.
+
+## Lambda Actions
+
+- `list_files` : This action list files in a S3 bucket at a give prefix
+ Example:
+ ```json
+ {
+ "action": "list_files",
+ "prefix": "secrets-finder/scheduled-scans/results/"
+ }
+ ```
+- `ingest_findings` : This action read a given `.json` file and create new records in `findings`, `scans` and `jobs` table. Corresponding file is deleted from S3 on successful ingestion
+ Example:
+ ```json
+ {
+ "action": "ingest_findings",
+ "file_key": "secrets-finder/scheduled-scans/results/7eb4d1ab-ac6a-4b84-a18d-4bd944d4ef2a.json"
+ }
+ ```
+
+## Add New Ingestion
+
+Creating a new ingestion is a 4 step process.
+
+1. Create necessary DB migration version under `migrations` directory. Refer [Create New Revisions](../../../../../migrations/README.md#creating-new-revision)
+2. Create a new ingestion script under `modules` directory.
+3. Register new ingestion with an action in `ingestion.py` under `ingestion_callback_mapping`
+4. Add a new branch in [step function definition](../../configuration/ingestion_sfn_definition.json).
+
+Use `terraform apply` to build and deploy the Lambda. Once deployed, the next Step Function invocation will automatically trigger the new ingestion.
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/ingestion.py b/infrastructure/ingestion/aws/lambda/ingestion/ingestion.py
new file mode 100644
index 0000000..442b9c7
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/ingestion.py
@@ -0,0 +1,68 @@
+import os
+from typing import List, Dict, Any, Callable, Union
+import logging
+from modules.common.s3 import S3
+from modules.findings_ingestion import ingest_findings
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s",
+ handlers=[logging.StreamHandler()],
+)
+
+bucket_name: str = os.environ.get("BUCKET_NAME")
+db_url: str = os.environ.get("DB_URL")
+
+ingestion_callback_mapping: Dict[str, Callable[[str, str, str], bool]] = {
+ "ingest_findings": ingest_findings
+}
+
+
+def list_files(prefix: str) -> Dict[str, Union[int, Dict[str, List[str]]]]:
+ s3 = S3(bucket_name)
+ files = s3.list_files(prefix)
+ return {"statusCode": 200, "body": {"files": files}}
+
+
+def handler(event: Dict[str, Any], _) -> Dict[str, Any]:
+ """
+ Handle the Lambda function invocation.
+
+ Args:
+ event (Dict[str, Any]): The event data passed to the Lambda function.
+ _ (Any): The context object representing the runtime information.
+
+ Returns:
+ Dict[str, Any]: The response data returned by the Lambda function.
+
+ Raises:
+ ValueError: If the request is invalid or the action is not supported.
+ """
+ action: str = event.get("action")
+
+ if action == "list_files":
+ prefix: str = event.get("prefix")
+ if not prefix:
+ logging.error("missing prefix in request for action list_files")
+ raise ValueError("Invalid request")
+
+ response: Dict[str, Union[int, Dict[str, List[str]]]] = list_files(prefix)
+ return response
+
+ elif action in ingestion_callback_mapping:
+ file_key: str = event.get("file_key")
+ if not file_key:
+ logging.error("missing file_key in request for action ingest_findings")
+ raise ValueError("Invalid request")
+
+ status: bool = ingestion_callback_mapping[action](db_url, bucket_name, file_key)
+
+ if not status:
+ logging.error("Error ingesting data")
+ raise ValueError("Error ingesting data")
+
+ return {"statusCode": 200, "body": {"success": status}}
+
+ else:
+ logging.error(f"Invalid action: {action}")
+ raise ValueError("Invalid request")
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/__init__.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/common/__init__.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/common/s3.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/s3.py
new file mode 100644
index 0000000..a512dd6
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/s3.py
@@ -0,0 +1,123 @@
+import os
+import tempfile
+import boto3
+from typing import List, Tuple
+
+
+class S3:
+ """
+ Represents an S3 client for interacting with an S3 bucket.
+
+ Args:
+ bucket_name (str): The name of the S3 bucket.
+
+ Attributes:
+ client (boto3.client): The S3 client.
+ bucket_name (str): The name of the S3 bucket.
+
+ """
+
+ client: boto3.client = None
+ bucket_name: str = None
+
+ def __init__(self, bucket_name: str) -> None:
+ """
+ Initializes the S3 client.
+
+ Args:
+ bucket_name (str): The name of the S3 bucket.
+
+ """
+ self.client = boto3.client("s3")
+ self.bucket_name = bucket_name
+
+ def list_files(self, prefix: str) -> List[str]:
+ """
+ Lists all the files in the S3 bucket with the specified prefix.
+
+ Args:
+ prefix (str): The prefix to filter the files.
+
+ Returns:
+ List[str]: A list of file keys.
+
+ """
+ keys: List[str] = []
+ continuation_token: str = None
+
+ if not prefix.endswith("/"):
+ prefix += "/"
+
+ while True:
+ kwargs: dict = {
+ "Bucket": self.bucket_name,
+ "Prefix": prefix,
+ "Delimiter": "/",
+ }
+
+ if continuation_token:
+ kwargs["ContinuationToken"] = continuation_token
+
+ response: dict = self.client.list_objects_v2(**kwargs)
+ contents: List[dict] = response.get("Contents", [])
+ _keys: List[str] = [
+ content["Key"]
+ for content in contents
+ if not content["Key"].endswith("/")
+ ]
+ keys.extend(_keys)
+
+ if not response.get("IsTruncated"):
+ break
+
+ continuation_token = response.get("NextContinuationToken")
+
+ return keys
+
+ def download_file(self, file_key: str) -> str:
+ """
+ Downloads the file with the specified key from the bucket.
+
+ Args:
+ file_key (str): The key of the file to download.
+
+ Returns:
+ str: The local path of the downloaded file.
+
+ """
+ file_name: str = os.path.basename(file_key)
+ local_path: str = os.path.join(tempfile.gettempdir(), file_name)
+ self.client.download_file(self.bucket_name, file_key, local_path)
+ return local_path
+
+ def download_first_file(self, prefix: str) -> Tuple[str, str]:
+ """
+ Downloads the first file with the specified prefix from the bucket.
+
+ Args:
+ prefix (str): The prefix to filter the files.
+
+ Returns:
+ Tuple[str, str]: A tuple containing the file key and the local path of the downloaded file.
+
+ """
+ files = self.list_files(prefix)
+ if not files:
+ return None
+
+ key = files[0]
+ return key, self.download_file(files[0])
+
+ def delete_file(self, file_key: str) -> bool:
+ """
+ Deletes the file with the specified key from the bucket.
+
+ Args:
+ file_key (str): The key of the file to delete.
+
+ Returns:
+ bool: True if the file was successfully deleted, False otherwise.
+
+ """
+ self.client.delete_object(Bucket=self.bucket_name, Key=file_key)
+ return True
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/common/utils.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/utils.py
new file mode 100644
index 0000000..c61ddd0
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/modules/common/utils.py
@@ -0,0 +1 @@
+DATE_TIME_FORMAT: str = "%Y-%m-%dT%H:%M:%S.%f"
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/modules/findings_ingestion.py b/infrastructure/ingestion/aws/lambda/ingestion/modules/findings_ingestion.py
new file mode 100644
index 0000000..9246eb9
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/modules/findings_ingestion.py
@@ -0,0 +1,197 @@
+import datetime
+import json
+import logging
+from sqlalchemy import (
+ JSON,
+ VARCHAR,
+ Boolean,
+ Column,
+ Integer,
+ String,
+ DateTime,
+ create_engine,
+)
+from sqlalchemy.orm import sessionmaker, declarative_base
+from modules.common.s3 import S3
+from modules.common.utils import DATE_TIME_FORMAT
+import uuid
+
+Base = declarative_base()
+
+
+class Finding(Base):
+ __tablename__ = "findings"
+ uuid = Column(String, primary_key=True)
+ scan_uuid = Column(String, nullable=False)
+ job_uuid = Column(String, nullable=False)
+ organization = Column(String, nullable=True)
+ scan_context = Column(String, nullable=False)
+ created_on = Column(DateTime, nullable=False)
+ decoder_name = Column(String, nullable=False)
+ detector_name = Column(String, nullable=False)
+ detector_type = Column(Integer, nullable=False)
+ raw = Column(VARCHAR, nullable=False)
+ raw_v2 = Column(VARCHAR, nullable=True)
+ redacted = Column(String, nullable=True)
+ source_name = Column(String, nullable=False)
+ source_type = Column(Integer, nullable=False)
+ verified = Column(Boolean, nullable=False)
+ extra_data = Column(JSON, nullable=True)
+ repository = Column(String, nullable=True)
+ filename = Column(String, nullable=False)
+ commit_hash = Column(String, nullable=True)
+ committer_email = Column(String, nullable=True)
+ commit_timestamp = Column(DateTime, nullable=True)
+ line_number = Column(Integer, nullable=False)
+ is_still_valid = Column(Boolean, nullable=False)
+ last_validated_on = Column(DateTime, nullable=False)
+
+
+class Scans(Base):
+ __tablename__ = "scans"
+ uuid = Column(String, primary_key=True)
+ job_uuid = Column(String, nullable=False)
+ scan_identifier = Column(String, nullable=True)
+ scm = Column(String, nullable=False)
+ organization = Column(String, nullable=True)
+ repository = Column(String, nullable=False)
+ scan_context = Column(String, nullable=False)
+ started_on = Column(DateTime, nullable=False)
+ completed_on = Column(DateTime, nullable=False)
+ status = Column(Integer, nullable=False)
+ scan_mode = Column(String, nullable=False)
+ scan_type = Column(String, nullable=False)
+ # metadata is a reserved attribute name in SQLAlchemy
+ metadata_ = Column("metadata", JSON, nullable=True)
+
+
+class Jobs(Base):
+ __tablename__ = "jobs"
+ uuid = Column(String, primary_key=True)
+ scan_identifier = Column(String, nullable=False)
+ scm = Column(String, nullable=False)
+ scan_context = Column(String, nullable=False)
+ started_on = Column(DateTime, nullable=False)
+ completed_on = Column(DateTime, nullable=False)
+ status = Column(Integer, nullable=False)
+ scan_mode = Column(String, nullable=False)
+ scan_type = Column(String, nullable=False)
+
+
+def ingest_findings(db_url: str, bucket_name: str, file_key: str) -> bool:
+ """
+ Ingests findings from a file downloaded from S3 into a database.
+
+ Args:
+ db_url (str): The URL of the database to connect to.
+ bucket_name (str): The name of the S3 bucket.
+ file_key (str): The key of the file in the S3 bucket.
+
+ Returns:
+ bool: True if the ingestion is successful, False otherwise.
+ """
+ logging.info(f"Downloading file from S3, key: {file_key}, bucket: {bucket_name}")
+ s3 = S3(bucket_name)
+ file_path = s3.download_file(file_key)
+ logging.info(f"File downloaded to {file_path}, key: {file_key}")
+
+ with open(file_path, "r") as file:
+ data = json.load(file)
+
+ if not data:
+ logging.error("No data in the file")
+ return False
+
+ # Create a SQLAlchemy engine to connect to the database
+ engine = create_engine(db_url)
+
+ # Create a session
+ Session = sessionmaker(bind=engine)
+ session = Session()
+
+ job = Jobs(
+ uuid=data["scan_uuid"],
+ scan_identifier=data["scan_identifier"],
+ scm=data["scm"],
+ scan_context=data["scan_context"],
+ started_on=datetime.datetime.strptime(data["start"], DATE_TIME_FORMAT),
+ completed_on=datetime.datetime.strptime(data["end"], DATE_TIME_FORMAT),
+ status=data["status"],
+ scan_type=data["scan_type"],
+ scan_mode=data["scan_mode"],
+ )
+
+ session.add(job)
+
+ for result in data.get("results", []):
+ scan = Scans(
+ uuid=result["scan_uuid"],
+ job_uuid=job.uuid,
+ scan_identifier=job.scan_identifier,
+ scm=job.scm,
+ organization=result["organization"],
+ repository=result["repository"],
+ scan_context=job.scan_context,
+ started_on=datetime.datetime.strptime(result["start"], DATE_TIME_FORMAT),
+ completed_on=datetime.datetime.strptime(result["end"], DATE_TIME_FORMAT),
+ status=result.get("status"),
+ scan_mode=job.scan_mode,
+ scan_type=job.scan_type,
+ metadata_=result.get("metadata", {}),
+ )
+
+ logging.info(f'Ingesting scan: {result["scan_uuid"]}')
+ session.add(scan)
+
+ for finding in result.get("findings", []):
+ source_meta_data = list(
+ finding.get("SourceMetadata", {}).get("Data", {}).values()
+ )[0]
+ finding = Finding(
+ uuid=str(uuid.uuid4()),
+ scan_uuid=result["scan_uuid"],
+ job_uuid=job.uuid,
+ organization=result["organization"],
+ scan_context=job.scan_context,
+ created_on=datetime.datetime.now(),
+ decoder_name=finding["DetectorName"],
+ detector_name=finding["DetectorName"],
+ detector_type=finding["DetectorType"],
+ raw=finding["Raw"],
+ raw_v2=finding.get("RawV2", ""),
+ redacted=finding.get("Redacted", ""),
+ source_name=finding["SourceName"],
+ source_type=finding["SourceType"],
+ verified=finding["Verified"],
+ extra_data=finding.get("ExtraData", {}),
+ repository=result["repository"],
+ filename=source_meta_data["file"],
+ commit_hash=source_meta_data.get("commit"),
+ committer_email=source_meta_data.get("email"),
+ commit_timestamp=(
+ datetime.datetime.strptime(
+ source_meta_data.get("timestamp"), "%Y-%m-%d %H:%M:%S %z"
+ )
+ if source_meta_data.get("timestamp")
+ else None
+ ),
+ line_number=source_meta_data["line"],
+ is_still_valid=finding["Verified"],
+ last_validated_on=datetime.datetime.strptime(
+ result["end"], DATE_TIME_FORMAT
+ ),
+ )
+
+ logging.info(
+ f'Ingesting finding: {finding.uuid} for scan: {result["scan_uuid"]}'
+ )
+ session.add(finding)
+
+ if not s3.delete_file(file_key):
+ logging.error(f"Error deleting file from S3, key: {file_key}")
+ session.rollback()
+ return False
+
+ logging.info(f"Deleted file from S3, key: {file_key}")
+ session.commit()
+ return True
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/package.sh b/infrastructure/ingestion/aws/lambda/ingestion/package.sh
new file mode 100755
index 0000000..2866648
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/package.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+docker build -t ingestion-lambda --platform=linux/arm64 -f Dockerfile .
+
+docker run --rm -v $(pwd):/output ingestion-lambda cp /app/ingestion.zip /output/
diff --git a/infrastructure/ingestion/aws/lambda/ingestion/pyproject.toml b/infrastructure/ingestion/aws/lambda/ingestion/pyproject.toml
new file mode 100644
index 0000000..e1c9049
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/ingestion/pyproject.toml
@@ -0,0 +1,21 @@
+[tool.poetry]
+name = "ingestion"
+version = "0.1.0"
+description = "Lambda to ingest data into the data lake"
+authors = ["Thomson Reuters "]
+license = "mit"
+readme = "README.md"
+include = ["modules/*.py", "modules/common/*"]
+
+[tool.poetry.dependencies]
+python = "^3.9"
+boto3 = "^1.34.130"
+sqlalchemy = "^2.0.31"
+psycopg2-binary = "^2.9.9"
+
+[tool.poetry-plugin-lambda-build]
+package_artifact_path = "ingestion.zip"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/infrastructure/ingestion/aws/lambda/migration/Dockerfile b/infrastructure/ingestion/aws/lambda/migration/Dockerfile
new file mode 100644
index 0000000..b81d74f
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/migration/Dockerfile
@@ -0,0 +1,12 @@
+FROM python@sha256:320a7a4250aba4249f458872adecf92eea88dc6abd2d76dc5c0f01cac9b53990
+
+RUN pip install poetry==1.8.3 --no-cache-dir
+
+WORKDIR /app
+
+COPY . /app/
+
+RUN poetry self add poetry-plugin-lambda-build \
+ && poetry self add poetry-plugin-export \
+ && poetry lock --no-update \
+ && poetry build-lambda
diff --git a/infrastructure/ingestion/aws/lambda/migration/README.md b/infrastructure/ingestion/aws/lambda/migration/README.md
new file mode 100644
index 0000000..3e2206c
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/migration/README.md
@@ -0,0 +1,16 @@
+# Migration Lambda
+
+This directory contains all the necessary scripts to package Migrations (located at the root level) as a Lambda function.
+
+The packaging process uses the Poetry Lambda plugin and leverages Docker to ensure Lambda packages are generated for the correct platform.
+
+## Usage
+
+To package lambda, run following command
+
+```bash
+./package.sh
+```
+
+> [!NOTE]
+> Any changes in Migrations should be automatically detected during the repackaging process.
diff --git a/infrastructure/ingestion/aws/lambda/migration/package.sh b/infrastructure/ingestion/aws/lambda/migration/package.sh
new file mode 100755
index 0000000..3ff9917
--- /dev/null
+++ b/infrastructure/ingestion/aws/lambda/migration/package.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+ROOT_DIR=$(git rev-parse --show-toplevel)
+
+docker build -t migrate-lambda --platform=linux/arm64 -f Dockerfile $ROOT_DIR/migrations
+
+docker run --rm -v $(pwd):/output migrate-lambda cp /app/migration.zip /output/
diff --git a/infrastructure/ingestion/aws/locals.tf b/infrastructure/ingestion/aws/locals.tf
new file mode 100644
index 0000000..ebf70ba
--- /dev/null
+++ b/infrastructure/ingestion/aws/locals.tf
@@ -0,0 +1,11 @@
+locals {
+ environment = replace(lower(var.environment_type), " ", "-")
+ db_url = "postgresql://${var.rds_username}:${random_password.rds_master_password.result}@${aws_db_instance.rds_postgres.address}"
+ configuration_dir = "${path.module}/configuration"
+ ingestion_lambda_dir = "${path.module}/lambda/ingestion"
+ ingestion_lambda_archive = "${local.ingestion_lambda_dir}/ingestion.zip"
+ migration_lambda_dir = "${path.module}/lambda/migration"
+ migration_lambda_archive = "${local.migration_lambda_dir}/migration.zip"
+ s3_bucket_arn = "arn:aws:s3:::${var.s3_bucket_name}"
+ tags = var.tags
+}
diff --git a/infrastructure/ingestion/aws/outputs.tf b/infrastructure/ingestion/aws/outputs.tf
new file mode 100644
index 0000000..185e483
--- /dev/null
+++ b/infrastructure/ingestion/aws/outputs.tf
@@ -0,0 +1,3 @@
+output "rds_pg_endpoint" {
+ value = aws_db_instance.rds_postgres.endpoint
+}
diff --git a/infrastructure/ingestion/aws/providers.tf b/infrastructure/ingestion/aws/providers.tf
new file mode 100644
index 0000000..d3d1e83
--- /dev/null
+++ b/infrastructure/ingestion/aws/providers.tf
@@ -0,0 +1,23 @@
+terraform {
+ required_version = ">=1.3"
+
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = "~> 5.0"
+ }
+ }
+
+ backend "s3" {
+ encrypt = true
+ }
+}
+
+provider "aws" {
+ region = var.aws_region
+ profile = var.aws_profile
+
+ default_tags {
+ tags = local.tags
+ }
+}
diff --git a/infrastructure/ingestion/aws/rds.tf b/infrastructure/ingestion/aws/rds.tf
new file mode 100644
index 0000000..a133b4f
--- /dev/null
+++ b/infrastructure/ingestion/aws/rds.tf
@@ -0,0 +1,18 @@
+resource "aws_db_instance" "rds_postgres" {
+ identifier = "${var.project_name}-rds-postgres"
+ allocated_storage = 10
+ engine = "postgres"
+ engine_version = "16.3"
+ instance_class = "db.t3.micro" # Smallest instance type for PostgreSQL
+ username = var.rds_username
+ password = random_password.rds_master_password.result
+ parameter_group_name = "default.postgres16"
+ db_name = var.rds_db_name
+ skip_final_snapshot = true
+ publicly_accessible = false
+ storage_encrypted = true
+ deletion_protection = true
+ backup_retention_period = 7
+ vpc_security_group_ids = [aws_security_group.rds_sg.id]
+ db_subnet_group_name = var.db_subnet_group_name
+}
diff --git a/infrastructure/ingestion/aws/s3.tfbackend b/infrastructure/ingestion/aws/s3.tfbackend
new file mode 100644
index 0000000..6fa4016
--- /dev/null
+++ b/infrastructure/ingestion/aws/s3.tfbackend
@@ -0,0 +1,5 @@
+bucket = ""
+key = ""
+region = ""
+dynamodb_table = ""
+profile = ""
diff --git a/infrastructure/ingestion/aws/secrets.tf b/infrastructure/ingestion/aws/secrets.tf
new file mode 100644
index 0000000..e43778b
--- /dev/null
+++ b/infrastructure/ingestion/aws/secrets.tf
@@ -0,0 +1,19 @@
+resource "random_password" "rds_master_password" {
+ length = 40
+ special = true
+ min_special = 5
+ override_special = "!#$%^&*()-_=+[]{}<>:?"
+ keepers = {
+ pass_version = 1
+ }
+}
+
+resource "aws_secretsmanager_secret" "rds_master_password" {
+ name = "${var.project_name}-rds-master-password"
+ description = "Master password for RDS instance"
+}
+
+resource "aws_secretsmanager_secret_version" "rds_master_password" {
+ secret_id = aws_secretsmanager_secret.rds_master_password.id
+ secret_string = random_password.rds_master_password.result
+}
diff --git a/infrastructure/ingestion/aws/securitygroups.tf b/infrastructure/ingestion/aws/securitygroups.tf
new file mode 100644
index 0000000..9e23393
--- /dev/null
+++ b/infrastructure/ingestion/aws/securitygroups.tf
@@ -0,0 +1,33 @@
+resource "aws_security_group" "rds_sg" {
+ name = "${var.project_name}-rds-sg"
+ description = "Security group for RDS instance"
+ vpc_id = data.aws_vpc.selected.id
+
+ ingress {
+ from_port = 5432
+ to_port = 5432
+ protocol = "tcp"
+ cidr_blocks = [data.aws_vpc.selected.cidr_block]
+ }
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+}
+
+
+resource "aws_security_group" "lambda_sg" {
+ name = "${var.project_name}-lambda-sg"
+ description = "Security group for Lambda functions"
+ vpc_id = data.aws_vpc.selected.id
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+}
diff --git a/infrastructure/ingestion/aws/sfn.tf b/infrastructure/ingestion/aws/sfn.tf
new file mode 100644
index 0000000..540f09e
--- /dev/null
+++ b/infrastructure/ingestion/aws/sfn.tf
@@ -0,0 +1,13 @@
+resource "aws_sfn_state_machine" "ingestion-step-function" {
+ name = "${var.project_name}-ingestion-step-function"
+ role_arn = aws_iam_role.sfn_role.arn
+ definition = templatefile("${local.configuration_dir}/ingestion_sfn_definition.json", {
+ migrate_lambda_arn = "${aws_lambda_function.migration-lambda.arn}",
+ ingestion_lambda_arn = "${aws_lambda_function.ingestion-lambda.arn}"
+ })
+
+ depends_on = [
+ aws_iam_role.sfn_role,
+ aws_iam_role_policy.sfn_policy,
+ ]
+}
diff --git a/infrastructure/ingestion/aws/sts.tf b/infrastructure/ingestion/aws/sts.tf
new file mode 100644
index 0000000..8fc4b38
--- /dev/null
+++ b/infrastructure/ingestion/aws/sts.tf
@@ -0,0 +1 @@
+data "aws_caller_identity" "current" {}
diff --git a/infrastructure/ingestion/aws/terraform.tfvars.example b/infrastructure/ingestion/aws/terraform.tfvars.example
new file mode 100644
index 0000000..85c8a35
--- /dev/null
+++ b/infrastructure/ingestion/aws/terraform.tfvars.example
@@ -0,0 +1,17 @@
+aws_region = ""
+aws_profile = ""
+environment_type = ""
+project_name = ""
+vpc_name = ""
+subnet_name = ""
+db_subnet_group_name = ""
+permissions_boundary_arn = ""
+s3_bucket_name = ""
+tags = {
+ "mytag" = "tag"
+ "mytag2" = "tag2"
+}
+rds_username = ""
+rds_db_name = ""
+ingestion_schedule = ""
+disable_ingestion_schedule = ""
diff --git a/infrastructure/ingestion/aws/variables.tf b/infrastructure/ingestion/aws/variables.tf
new file mode 100644
index 0000000..5b1b594
--- /dev/null
+++ b/infrastructure/ingestion/aws/variables.tf
@@ -0,0 +1,115 @@
+variable "aws_region" {
+ type = string
+ description = "AWS region where to deploy resources"
+
+ validation {
+ condition = can(regex("^(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\\d+$", var.aws_region))
+ error_message = "You should enter a valid AWS region (https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.RegionsAndAvailabilityZones.html)"
+ }
+}
+
+variable "aws_profile" {
+ type = string
+ description = "AWS profile to use for authentication"
+}
+
+variable "environment_type" {
+ type = string
+ description = "Environment type"
+
+ validation {
+ condition = contains(["PRODUCTION", "PRE-PRODUCTION", "QUALITY ASSURANCE", "INTEGRATION TESTING", "DEVELOPMENT", "LAB"], var.environment_type)
+ error_message = "The environment type should be one of the following values: PRODUCTION, PRE-PRODUCTION, QUALITY ASSURANCE, INTEGRATION TESTING, DEVELOPMENT, LAB (case sensitive)"
+ }
+}
+
+variable "vpc_name" {
+ type = string
+ description = "Identifier of the VPC to use for secrets-finder"
+}
+
+variable "subnet_name" {
+ type = string
+ description = "Name of the subnet where to deploy the resources (wildcards are allowed: first match is used)"
+}
+
+variable "db_subnet_group_name" {
+ type = string
+ description = "Name of the RDS subnet group"
+}
+
+variable "tags" {
+ type = map(string)
+ description = "A map of tags to add to the resources"
+
+ validation {
+ condition = alltrue([for v in values(var.tags) : v != ""])
+ error_message = "Tag values must not be empty."
+ }
+}
+
+variable "project_name" {
+ type = string
+ description = "Name of the project"
+ default = "secrets-finder"
+}
+
+variable "permissions_boundary_arn" {
+ type = string
+ description = "ARN of the permissions boundary to use for the IAM role"
+
+ validation {
+ condition = can(regex("^arn:aws:iam::[0-9]{12}:policy\\/([a-zA-Z0-9-_.]+)$", var.permissions_boundary_arn))
+ error_message = "The provided ARN is not a valid ARN for a policy"
+ }
+}
+
+variable "s3_bucket_name" {
+ type = string
+ description = "Name of the S3 bucket to create"
+
+ validation {
+ condition = can(regex("^[a-z0-9.-]{3,63}$", var.s3_bucket_name))
+ error_message = "The S3 bucket name must be a valid string with only a-z0-9.- characters and have a length between 3 and 63"
+ }
+}
+
+variable "rds_username" {
+ type = string
+ description = "Username for the RDS instance"
+ default = "secrets_finder"
+
+ validation {
+ condition = can(regex("^[a-z][a-z0-9_]{1,}$", var.rds_username))
+ error_message = "The RDS username must be a valid string with only a-z0-9_ characters, have a length greater than 1, and not start with a number"
+ }
+}
+
+
+variable "rds_db_name" {
+ type = string
+ description = "Name of the database to create in the RDS instance"
+ default = "secrets_finder"
+
+ validation {
+ condition = can(regex("^[a-z][a-z0-9_]{1,}$", var.rds_db_name))
+ error_message = "The RDS database name must be a valid string with only a-z0-9_ characters, have a length greater than 1, and not start with a number"
+ }
+}
+
+variable "ingestion_schedule" {
+ type = string
+ description = "Cron schedule for the CloudWatch Event Rule"
+ default = "rate(24 hours)"
+
+ validation {
+ condition = can(regex("^(rate|cron)\\(\\d+ (minutes|hours|days)\\)$", var.ingestion_schedule))
+ error_message = "The ingestion schedule should be in the format 'rate(n minutes|hours|days)' or 'cron(expression)', where n is a positive integer"
+ }
+}
+
+variable "disable_ingestion_schedule" {
+ type = bool
+ description = "Disable the ingestion schedule"
+ default = false
+}
diff --git a/infrastructure/ingestion/aws/vpc.tf b/infrastructure/ingestion/aws/vpc.tf
new file mode 100644
index 0000000..041279b
--- /dev/null
+++ b/infrastructure/ingestion/aws/vpc.tf
@@ -0,0 +1,22 @@
+data "aws_vpc" "selected" {
+ filter {
+ name = "tag:Name"
+ values = [var.vpc_name]
+ }
+}
+
+data "aws_subnets" "default" {
+ filter {
+ name = "tag:Name"
+ values = [var.subnet_name]
+ }
+}
+
+data "aws_subnet" "selected" {
+ id = element(sort(data.aws_subnets.default.ids), 0)
+}
+
+data "aws_security_group" "default" {
+ vpc_id = data.aws_vpc.selected.id
+ name = "default"
+}
diff --git a/migrations/README.md b/migrations/README.md
new file mode 100644
index 0000000..d97f2bf
--- /dev/null
+++ b/migrations/README.md
@@ -0,0 +1,51 @@
+# Migrations
+
+This directory contains alembic configuration and database versions to manage database migrations. Migrations are compatible with SQLite, Postgres & MariaDB.
+
+## Files
+
+- `alembic.ini`: Contains alembic configuration.
+
+- `migrate.py`: A wrapper script to programmatically run migrations. Example: using Lambdas.
+
+- `pyproject.toml`: This is the configuration file for the `poetry` package manager, which is used to manage dependencies for the script.
+
+- `db`: This directory contains alembic env config and versions.
+
+## Usage
+
+1. To run, setup the DB_URL environment variables. DB_URL environment variable value should be FQDN. Example: `postgresql://myuser:mypassword@127.0.0.1:5432/mydatabase`
+2. Setup the poetry environment
+ ```bash
+ poetry install
+ ```
+3. Run migration
+ ```bash
+ poetry run alembic upgrade head
+ ```
+
+## Creating New Revisions
+
+New migration revisions are needed whenever there are modifications to the database schema, such as adding a new table, adding a new column, or updating an existing column.
+
+1. Setup the poetry environment, if not already done
+ ```bash
+ poetry install
+ ```
+2. Run command. Provide a comment for the revision
+ ```bash
+ poetry run alembic revision -m ""
+ ```
+ This command will create a new revision file under `db/revisions`. Complete definition for `upgrade` and `downgrade` function.
+
+## Requirements
+
+- Python 3.9 or higher
+- `poetry` package manager
+- Connectivity to the database
+ > By default the database created will be private, you may have to run this script from a compute resource that is authorized to connect
+
+## Dependencies
+
+- alembic = "^1.13.1"
+- psycopg2-binary = "^2.9.9"
diff --git a/migrations/alembic.ini b/migrations/alembic.ini
new file mode 100644
index 0000000..f40a6e1
--- /dev/null
+++ b/migrations/alembic.ini
@@ -0,0 +1,116 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = db
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python>=3.9 or backports.zoneinfo library.
+# Any required deps can installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to ZoneInfo()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the
+# "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to db/versions. When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:db/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = %(DB_URL)s
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts. See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
+# hooks = ruff
+# ruff.type = exec
+# ruff.executable = %(here)s/.venv/bin/ruff
+# ruff.options = --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/migrations/db/env.py b/migrations/db/env.py
new file mode 100644
index 0000000..dfd6787
--- /dev/null
+++ b/migrations/db/env.py
@@ -0,0 +1,79 @@
+from logging.config import fileConfig
+import os
+
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+
+from alembic import context
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+config.set_main_option("DB_URL", os.environ["DB_URL"])
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+ fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+# from myapp import mymodel
+# target_metadata = mymodel.Base.metadata
+target_metadata = None
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+ """Run migrations in 'offline' mode.
+
+ This configures the context with just a URL
+ and not an Engine, though an Engine is acceptable
+ here as well. By skipping the Engine creation
+ we don't even need a DBAPI to be available.
+
+ Calls to context.execute() here emit the given string to the
+ script output.
+
+ """
+ url = config.get_main_option("sqlalchemy.url")
+ context.configure(
+ url=url,
+ target_metadata=target_metadata,
+ literal_binds=True,
+ dialect_opts={"paramstyle": "named"},
+ )
+
+ with context.begin_transaction():
+ context.run_migrations()
+
+
+def run_migrations_online() -> None:
+ """Run migrations in 'online' mode.
+
+ In this scenario we need to create an Engine
+ and associate a connection with the context.
+
+ """
+ connectable = engine_from_config(
+ config.get_section(config.config_ini_section, {}),
+ prefix="sqlalchemy.",
+ poolclass=pool.NullPool,
+ )
+
+ with connectable.connect() as connection:
+ context.configure(connection=connection, target_metadata=target_metadata)
+
+ with context.begin_transaction():
+ context.run_migrations()
+
+
+if context.is_offline_mode():
+ run_migrations_offline()
+else:
+ run_migrations_online()
diff --git a/migrations/db/script.py.mako b/migrations/db/script.py.mako
new file mode 100644
index 0000000..fbc4b07
--- /dev/null
+++ b/migrations/db/script.py.mako
@@ -0,0 +1,26 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+ ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+ ${downgrades if downgrades else "pass"}
diff --git a/migrations/db/versions/9fab027609bc_create_scans_table.py b/migrations/db/versions/9fab027609bc_create_scans_table.py
new file mode 100644
index 0000000..18f78b7
--- /dev/null
+++ b/migrations/db/versions/9fab027609bc_create_scans_table.py
@@ -0,0 +1,42 @@
+"""Create scans table
+
+Revision ID: 9fab027609bc
+Revises: ac05203c65bd
+Create Date: 2024-06-17 20:29:24.694581
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "9fab027609bc"
+down_revision: Union[str, None] = "ac05203c65bd"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+ op.create_table(
+ "scans",
+ sa.Column("uuid", sa.String(), primary_key=True),
+ sa.Column("job_uuid", sa.String(), nullable=False),
+ sa.Column("scan_identifier", sa.String(), nullable=False),
+ sa.Column("scm", sa.String(), nullable=False),
+ sa.Column("organization", sa.String(), nullable=True),
+ sa.Column("repository", sa.String(), nullable=False),
+ sa.Column("scan_context", sa.String(), nullable=False),
+ sa.Column("started_on", sa.DateTime(), nullable=False),
+ sa.Column("completed_on", sa.DateTime(), nullable=False),
+ sa.Column("status", sa.String(), nullable=False),
+ sa.Column("scan_mode", sa.String(), nullable=True),
+ sa.Column("scan_type", sa.String(), nullable=False),
+ sa.Column("metadata", sa.JSON(), nullable=True),
+ )
+
+
+def downgrade() -> None:
+ op.drop_table("scans")
diff --git a/migrations/db/versions/ac05203c65bd_create_findings_table.py b/migrations/db/versions/ac05203c65bd_create_findings_table.py
new file mode 100644
index 0000000..2d29cde
--- /dev/null
+++ b/migrations/db/versions/ac05203c65bd_create_findings_table.py
@@ -0,0 +1,53 @@
+"""Create findings/secrets table
+
+Revision ID: ac05203c65bd
+Revises:
+Create Date: 2024-06-17 18:59:55.247810
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "ac05203c65bd"
+down_revision: Union[str, None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+ op.create_table(
+ "findings",
+ sa.Column("uuid", sa.String(), primary_key=True),
+ sa.Column("scan_uuid", sa.String(), nullable=False),
+ sa.Column("job_uuid", sa.String(), nullable=False),
+ sa.Column("organization", sa.String(), nullable=True),
+ sa.Column("scan_context", sa.String(), nullable=False),
+ sa.Column("created_on", sa.DateTime(), nullable=False),
+ sa.Column("decoder_name", sa.String(), nullable=False),
+ sa.Column("detector_name", sa.String(), nullable=False),
+ sa.Column("detector_type", sa.Integer(), nullable=False),
+ sa.Column("raw", sa.VARCHAR(), nullable=False),
+ sa.Column("raw_v2", sa.VARCHAR(), nullable=True),
+ sa.Column("redacted", sa.String(), nullable=True),
+ sa.Column("source_name", sa.String(), nullable=False),
+ sa.Column("source_type", sa.Integer(), nullable=False),
+ sa.Column("verified", sa.Boolean(), nullable=False),
+ sa.Column("extra_data", sa.JSON(), nullable=True),
+ sa.Column("repository", sa.String(), nullable=True),
+ sa.Column("filename", sa.String(), nullable=False),
+ sa.Column("commit_hash", sa.String(), nullable=True),
+ sa.Column("committer_email", sa.String(), nullable=True),
+ sa.Column("commit_timestamp", sa.DateTime(), nullable=True),
+ sa.Column("line_number", sa.Integer(), nullable=False),
+ sa.Column("is_still_valid", sa.Boolean(), nullable=False),
+ sa.Column("last_validated_on", sa.DateTime(), nullable=False),
+ )
+
+
+def downgrade() -> None:
+ op.drop_table("findings")
diff --git a/migrations/db/versions/db4e0564e6df_create_jobs_table.py b/migrations/db/versions/db4e0564e6df_create_jobs_table.py
new file mode 100644
index 0000000..7b18ff4
--- /dev/null
+++ b/migrations/db/versions/db4e0564e6df_create_jobs_table.py
@@ -0,0 +1,38 @@
+"""create jobs table
+
+Revision ID: db4e0564e6df
+Revises: 9fab027609bc
+Create Date: 2024-06-22 13:09:36.346009
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "db4e0564e6df"
+down_revision: Union[str, None] = "9fab027609bc"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+ op.create_table(
+ "jobs",
+ sa.Column("uuid", sa.String(), primary_key=True),
+ sa.Column("scan_identifier", sa.String(), nullable=False),
+ sa.Column("scm", sa.String(), nullable=False),
+ sa.Column("scan_context", sa.String(), nullable=False),
+ sa.Column("started_on", sa.DateTime(), nullable=False),
+ sa.Column("completed_on", sa.DateTime(), nullable=False),
+ sa.Column("status", sa.String(), nullable=False),
+ sa.Column("scan_mode", sa.String(), nullable=True),
+ sa.Column("scan_type", sa.String(), nullable=False),
+ )
+
+
+def downgrade() -> None:
+ op.drop_table("jobs")
diff --git a/migrations/migrate.py b/migrations/migrate.py
new file mode 100644
index 0000000..0e71f46
--- /dev/null
+++ b/migrations/migrate.py
@@ -0,0 +1,24 @@
+import logging
+from alembic.config import Config
+from alembic import command
+
+
+def migrate(event, context):
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s",
+ handlers=[logging.StreamHandler()],
+ )
+
+ logging.info("Starting Alembic upgrade")
+ # Set up the Alembic configuration
+ alembic_cfg = Config("alembic.ini")
+
+ # Run the Alembic upgrade command
+ command.upgrade(alembic_cfg, "head")
+
+ return {"statusCode": 200, "body": "Alembic upgrade successful!"}
+
+
+if __name__ == "__main__":
+ migrate({}, {})
diff --git a/migrations/pyproject.toml b/migrations/pyproject.toml
new file mode 100644
index 0000000..1b641e7
--- /dev/null
+++ b/migrations/pyproject.toml
@@ -0,0 +1,20 @@
+[tool.poetry]
+name = "migrate"
+version = "0.1.0"
+description = "DB configuration and migration scripts"
+authors = ["Thomson Reuters "]
+license = "MIT"
+readme = "README.md"
+include = ["alembic.ini", "db/*", "db/versions/*"]
+
+[tool.poetry.dependencies]
+python = "^3.9"
+alembic = "^1.13.1"
+psycopg2-binary = "^2.9.9"
+
+[tool.poetry-plugin-lambda-build]
+package_artifact_path = "migration.zip"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"