diff --git a/.github/workflows/base-lambdas-reusable-deploy-all.yml b/.github/workflows/base-lambdas-reusable-deploy-all.yml
index 40c2a65e4..b77b59140 100644
--- a/.github/workflows/base-lambdas-reusable-deploy-all.yml
+++ b/.github/workflows/base-lambdas-reusable-deploy-all.yml
@@ -316,4 +316,32 @@ jobs:
lambda_aws_name: UpdateUploadStateLambda
lambda_layer_names: "core_lambda_layer"
secrets:
- AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
\ No newline at end of file
+ AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
+
+ deploy_data_collection_lambda:
+ name: Deploy data collection lambda
+ uses: ./.github/workflows/base-lambdas-reusable-deploy.yml
+ with:
+ environment: ${{ inputs.environment}}
+ python_version: ${{ inputs.python_version }}
+ build_branch: ${{ inputs.build_branch}}
+ sandbox: ${{ inputs.sandbox }}
+ lambda_handler_name: data_collection_handler
+ lambda_aws_name: DataCollectionLambda
+ lambda_layer_names: "core_lambda_layer,data_lambda_layer"
+ secrets:
+ AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
+
+ deploy_statistical_report_lambda:
+ name: Deploy statistical report lambda
+ uses: ./.github/workflows/base-lambdas-reusable-deploy.yml
+ with:
+ environment: ${{ inputs.environment}}
+ python_version: ${{ inputs.python_version }}
+ build_branch: ${{ inputs.build_branch}}
+ sandbox: ${{ inputs.sandbox }}
+ lambda_handler_name: statistical_report_handler
+ lambda_aws_name: StatisticalReportLambda
+ lambda_layer_names: "core_lambda_layer,data_lambda_layer"
+ secrets:
+ AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }}
diff --git a/.gitignore b/.gitignore
index 59acc5ef5..65d4573e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -101,4 +101,7 @@ node_modules/
lambdas/tests/unit/helpers/data/pdf/tmp
/lambdas/package_/
-batch_update_progress.json
\ No newline at end of file
+batch_update_progress.json
+
+# jupyter notebook files
+*.ipynb
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 1908f9f25..f554b76bb 100644
--- a/Makefile
+++ b/Makefile
@@ -56,8 +56,7 @@ test-unit-coverage:
cd ./lambdas && ./venv/bin/python3 -m pytest --cov=. --cov-report xml:coverage.xml
test-unit-coverage-html:
- cd ./lambdas
- coverage run --source=. --omit=tests/* -m pytest -v tests && coverage report && coverage html
+ cd ./lambdas && coverage run --source=. --omit="tests/*" -m pytest -v tests && coverage report && coverage html
test-unit-collect:
cd ./lambdas && ./venv/bin/python3 -m pytest tests/ --collect-only
diff --git a/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx b/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx
index 0c99835e1..364f2076b 100644
--- a/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx
+++ b/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx
@@ -1,6 +1,6 @@
import React, { Dispatch, SetStateAction, useEffect, useRef, useState } from 'react';
import useTitle from '../../../../helpers/hooks/useTitle';
-import { BackLink, Button, Table, WarningCallout } from 'nhsuk-react-components';
+import { Button, Table, WarningCallout } from 'nhsuk-react-components';
import LinkButton from '../../../generic/linkButton/LinkButton';
import { SearchResult } from '../../../../types/generic/searchResult';
import getDocumentSearchResults from '../../../../helpers/requests/getDocumentSearchResults';
@@ -23,6 +23,7 @@ import { DOCUMENT_TYPE } from '../../../../types/pages/UploadDocumentsPage/types
import DeleteResultStage from '../deleteResultStage/DeleteResultStage';
import { DOWNLOAD_STAGE } from '../../../../types/generic/downloadStage';
import PatientSummary from '../../../generic/patientSummary/PatientSummary';
+import BackButton from '../../../generic/backButton/BackButton';
export type Props = {
numberOfFiles: number;
@@ -89,16 +90,7 @@ function RemoveRecordStage({ numberOfFiles, recordType, setDownloadStage }: Prop
const PageIndexView = () => (
<>
- {
- e.preventDefault();
- navigate(routes.LLOYD_GEORGE);
- }}
- >
- Go back
-
+
Remove this {recordType} record
Before removing
diff --git a/app/src/components/generic/backButton/BackButton.test.tsx b/app/src/components/generic/backButton/BackButton.test.tsx
index 06f6a7e9d..2e31cf500 100644
--- a/app/src/components/generic/backButton/BackButton.test.tsx
+++ b/app/src/components/generic/backButton/BackButton.test.tsx
@@ -26,7 +26,7 @@ describe('BackButton', () => {
mockPathname = { pathname: testUrl };
render();
- userEvent.click(screen.getByText('Back'));
+ userEvent.click(screen.getByText('Go back'));
await waitFor(() => {
expect(mockUseNavigate).toHaveBeenCalledWith(-1);
diff --git a/app/src/components/generic/backButton/BackButton.tsx b/app/src/components/generic/backButton/BackButton.tsx
index e76ad9a2f..b151b2b95 100644
--- a/app/src/components/generic/backButton/BackButton.tsx
+++ b/app/src/components/generic/backButton/BackButton.tsx
@@ -12,8 +12,8 @@ const BackButton = () => {
};
return (
-
- Back
+
+ Go back
);
};
diff --git a/lambdas/enums/supported_document_types.py b/lambdas/enums/supported_document_types.py
index f97a7e5fe..a9a0bca0d 100644
--- a/lambdas/enums/supported_document_types.py
+++ b/lambdas/enums/supported_document_types.py
@@ -39,3 +39,10 @@ def get_dynamodb_table_name(self) -> str:
SupportedDocumentTypes.LG: os.getenv("LLOYD_GEORGE_DYNAMODB_NAME"),
}
return document_type_to_table_name[self]
+
+ def get_s3_bucket_name(self) -> str:
+ lookup_dict = {
+ SupportedDocumentTypes.ARF: os.getenv("DOCUMENT_STORE_BUCKET_NAME"),
+ SupportedDocumentTypes.LG: os.getenv("LLOYD_GEORGE_BUCKET_NAME"),
+ }
+ return lookup_dict[self]
diff --git a/lambdas/handlers/data_collection_handler.py b/lambdas/handlers/data_collection_handler.py
new file mode 100644
index 000000000..673a84d3e
--- /dev/null
+++ b/lambdas/handlers/data_collection_handler.py
@@ -0,0 +1,23 @@
+from services.data_collection_service import DataCollectionService
+from utils.audit_logging_setup import LoggingService
+from utils.decorators.ensure_env_var import ensure_environment_variables_for_non_webapi
+from utils.decorators.override_error_check import override_error_check
+
+logger = LoggingService(__name__)
+
+
+@ensure_environment_variables_for_non_webapi(
+ names=[
+ "LLOYD_GEORGE_DYNAMODB_NAME",
+ "LLOYD_GEORGE_BUCKET_NAME",
+ "DOCUMENT_STORE_DYNAMODB_NAME",
+ "DOCUMENT_STORE_BUCKET_NAME",
+ "WORKSPACE",
+ "STATISTICS_TABLE",
+ ]
+)
+@override_error_check
+def lambda_handler(_event, _context):
+ logger.info("Starting data collection process")
+ service = DataCollectionService()
+ service.collect_all_data_and_write_to_dynamodb()
diff --git a/lambdas/handlers/statistical_report_handler.py b/lambdas/handlers/statistical_report_handler.py
new file mode 100644
index 000000000..8b701c7d5
--- /dev/null
+++ b/lambdas/handlers/statistical_report_handler.py
@@ -0,0 +1,20 @@
+from services.statistical_report_service import StatisticalReportService
+from utils.audit_logging_setup import LoggingService
+from utils.decorators.ensure_env_var import ensure_environment_variables_for_non_webapi
+from utils.decorators.override_error_check import override_error_check
+
+logger = LoggingService(__name__)
+
+
+@ensure_environment_variables_for_non_webapi(
+ names=[
+ "WORKSPACE",
+ "STATISTICS_TABLE",
+ "STATISTICAL_REPORTS_BUCKET",
+ ]
+)
+@override_error_check
+def lambda_handler(_event, _context):
+ logger.info("Starting creating statistical report")
+ service = StatisticalReportService()
+ service.make_weekly_summary_and_output_to_bucket()
diff --git a/lambdas/models/statistics.py b/lambdas/models/statistics.py
new file mode 100644
index 000000000..5e5471221
--- /dev/null
+++ b/lambdas/models/statistics.py
@@ -0,0 +1,88 @@
+import uuid
+from decimal import Decimal
+from typing import NamedTuple
+
+from models.config import to_capitalized_camel
+from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
+
+
+class StatisticData(BaseModel):
+ model_config = ConfigDict(
+ alias_generator=to_capitalized_camel, populate_by_name=True
+ )
+ statistic_id: str = Field(
+ default_factory=lambda: str(uuid.uuid4()), alias="StatisticID"
+ )
+ date: str
+ ods_code: str
+
+ @field_serializer("statistic_id")
+ def serialise_id(self, statistic_id) -> str:
+ return f"{self.__class__.__name__}#{statistic_id}"
+
+ # noinspection PyNestedDecorators
+ @field_validator("statistic_id")
+ @classmethod
+ def deserialize_id(cls, raw_statistic_id: str) -> str:
+ if "#" in raw_statistic_id:
+ record_type, uuid_part = raw_statistic_id.split("#")
+ class_name = cls.__name__
+ assert (
+ record_type == class_name
+ ), f"StatisticID must be in the form of `{class_name}#uuid`"
+ else:
+ uuid_part = raw_statistic_id
+
+ return uuid_part
+
+ # noinspection PyNestedDecorators
+ @field_validator("ods_code")
+ @classmethod
+ def fill_empty_ods_code(cls, ods_code: str) -> str:
+ if not ods_code:
+ return "NO_ODS_CODE"
+ return ods_code
+
+
+class RecordStoreData(StatisticData):
+ total_number_of_records: int = 0
+ number_of_document_types: int = 0
+ total_size_of_records_in_megabytes: Decimal = Decimal(0)
+ average_size_of_documents_per_patient_in_megabytes: Decimal = Decimal(0)
+
+
+class OrganisationData(StatisticData):
+ number_of_patients: int = 0
+ average_records_per_patient: Decimal = Decimal(0)
+ daily_count_stored: int = 0
+ daily_count_viewed: int = 0
+ daily_count_downloaded: int = 0
+ daily_count_deleted: int = 0
+
+
+class ApplicationData(StatisticData):
+ active_user_ids_hashed: list[str] = []
+
+
+class LoadedStatisticData(NamedTuple):
+ record_store_data: list[RecordStoreData]
+ organisation_data: list[OrganisationData]
+ application_data: list[ApplicationData]
+
+
+def load_from_dynamodb_items(dynamodb_items: list[dict]) -> LoadedStatisticData:
+ output = LoadedStatisticData([], [], [])
+
+ for item in dynamodb_items:
+ data_type = item["StatisticID"].split("#")[0]
+ match data_type:
+ case "RecordStoreData":
+ output.record_store_data.append(RecordStoreData.model_validate(item))
+ case "OrganisationData":
+ output.organisation_data.append(OrganisationData.model_validate(item))
+ case "ApplicationData":
+ output.application_data.append(ApplicationData.model_validate(item))
+ case _:
+ raise ValueError(f"unknown type of statistic data: {data_type}")
+
+ return output
diff --git a/lambdas/requirements/requirements_test.txt b/lambdas/requirements/requirements_test.txt
index ad58448c0..18c7af468 100644
--- a/lambdas/requirements/requirements_test.txt
+++ b/lambdas/requirements/requirements_test.txt
@@ -6,6 +6,7 @@ isort==5.13.0
pip-audit==2.6.1
pytest-cov==4.1.0
pytest-mock==3.11.1
+pytest-unordered==0.6.0
pytest==7.4.3
requests_mock==1.11.0
ruff==0.0.284
diff --git a/lambdas/services/authoriser_service.py b/lambdas/services/authoriser_service.py
index 6cc107d63..42fb76c16 100644
--- a/lambdas/services/authoriser_service.py
+++ b/lambdas/services/authoriser_service.py
@@ -76,7 +76,7 @@ def find_login_session(self, ndr_session_id):
)
session_table_name = os.environ["AUTH_SESSION_TABLE_NAME"]
db_service = DynamoDBService()
- query_response = db_service.simple_query(
+ query_response = db_service.query_all_fields(
table_name=session_table_name,
key_condition_expression=Key("NDRSessionId").eq(ndr_session_id),
)
diff --git a/lambdas/services/base/cloudwatch_service.py b/lambdas/services/base/cloudwatch_service.py
new file mode 100644
index 000000000..5b30c6fa3
--- /dev/null
+++ b/lambdas/services/base/cloudwatch_service.py
@@ -0,0 +1,59 @@
+import os
+import time
+
+import boto3
+from utils.audit_logging_setup import LoggingService
+from utils.cloudwatch_logs_query import CloudwatchLogsQueryParams
+from utils.exceptions import LogsQueryException
+
+logger = LoggingService(__name__)
+
+
+class CloudwatchService:
+ def __init__(self):
+ self.logs_client = boto3.client("logs")
+ self.workspace = os.environ["WORKSPACE"]
+ self.initialised = True
+
+ def query_logs(
+ self, query_params: CloudwatchLogsQueryParams, start_time: int, end_time: int
+ ) -> list[dict]:
+ response = self.logs_client.start_query(
+ logGroupName=f"/aws/lambda/{self.workspace}_{query_params.lambda_name}",
+ startTime=start_time,
+ endTime=end_time,
+ queryString=query_params.query_string,
+ )
+ query_id = response["queryId"]
+
+ raw_query_result = self.poll_query_result(query_id)
+ query_result = self.regroup_raw_query_result(raw_query_result)
+ return query_result
+
+ def poll_query_result(self, query_id: str, max_retries=20) -> list[list]:
+ for _ in range(max_retries):
+ response = self.logs_client.get_query_results(queryId=query_id)
+ if response["status"] == "Complete":
+ return response["results"]
+ elif response["status"] in ["Failed", "Cancelled", "Timeout"]:
+ self.log_and_raise_error(
+ f"Logs query failed with status: {response['status']}"
+ )
+ time.sleep(1)
+
+ self.log_and_raise_error(
+ f"Failed to get query result within max retries of {max_retries} times"
+ )
+
+ @staticmethod
+ def regroup_raw_query_result(raw_query_result: list[list[dict]]) -> list[dict]:
+ query_result = [
+ {column["field"]: column["value"] for column in row}
+ for row in raw_query_result
+ ]
+ return query_result
+
+ @staticmethod
+ def log_and_raise_error(error_message: str) -> None:
+ logger.error(error_message)
+ raise LogsQueryException(error_message)
diff --git a/lambdas/services/base/dynamo_service.py b/lambdas/services/base/dynamo_service.py
index 94adc1531..6fd9c592e 100644
--- a/lambdas/services/base/dynamo_service.py
+++ b/lambdas/services/base/dynamo_service.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
import boto3
from boto3.dynamodb.conditions import Attr, ConditionBase, Key
from botocore.exceptions import ClientError
@@ -76,7 +78,7 @@ def query_with_requested_fields(
logger.error(str(e), {"Result": f"Unable to query table: {table_name}"})
raise e
- def simple_query(self, table_name: str, key_condition_expression):
+ def query_all_fields(self, table_name: str, key_condition_expression):
"""
Allow querying dynamodb table without explicitly defining the fields to retrieve.
:param table_name: Dynamodb table name
@@ -85,7 +87,7 @@ def simple_query(self, table_name: str, key_condition_expression):
example usage:
from boto3.dynamodb.conditions import Key
- query_response = db_service.simple_query(
+ query_response = db_service.query_all_fields(
table_name=session_table_name,
key_condition_expression=Key("NDRSessionId").eq(ndr_session_id)
)
@@ -161,6 +163,35 @@ def scan_table(
logger.error(str(e), {"Result": f"Unable to scan table: {table_name}"})
raise e
+ def scan_whole_table(
+ self,
+ table_name: str,
+ project_expression: Optional[str] = None,
+ filter_expression: Optional[str] = None,
+ ) -> list[dict]:
+ try:
+ table = self.get_table(table_name)
+ scan_arguments = {}
+ if project_expression:
+ scan_arguments["ProjectionExpression"] = project_expression
+ if filter_expression:
+ scan_arguments["FilterExpression"] = filter_expression
+
+ paginated_result = table.scan(**scan_arguments)
+ dynamodb_scan_result = paginated_result.get("Items", [])
+ while "LastEvaluatedKey" in paginated_result:
+ start_key_for_next_page = paginated_result["LastEvaluatedKey"]
+ paginated_result = table.scan(
+ **scan_arguments,
+ ExclusiveStartKey=start_key_for_next_page,
+ )
+ dynamodb_scan_result += paginated_result["Items"]
+ return dynamodb_scan_result
+
+ except ClientError as e:
+ logger.error(str(e), {"Result": f"Unable to scan table: {table_name}"})
+ raise e
+
def batch_writing(self, table_name: str, item_list: list[dict]):
try:
table = self.get_table(table_name)
diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py
index 25ca9861c..cbcd33d07 100644
--- a/lambdas/services/base/s3_service.py
+++ b/lambdas/services/base/s3_service.py
@@ -132,3 +132,10 @@ def file_exist_on_s3(self, s3_bucket_name: str, file_key: str) -> bool:
return False
logger.error(str(e), {"Result": "Failed to check if file exists on s3"})
raise e
+
+ def list_all_objects(self, bucket_name: str) -> list[dict]:
+ s3_paginator = self.client.get_paginator("list_objects_v2")
+ s3_list_objects_result = []
+ for paginated_result in s3_paginator.paginate(Bucket=bucket_name):
+ s3_list_objects_result += paginated_result.get("Contents", [])
+ return s3_list_objects_result
diff --git a/lambdas/services/data_collection_service.py b/lambdas/services/data_collection_service.py
new file mode 100644
index 000000000..d1320cc2a
--- /dev/null
+++ b/lambdas/services/data_collection_service.py
@@ -0,0 +1,337 @@
+import hashlib
+import os
+from collections import Counter, defaultdict
+from datetime import datetime
+
+import polars as pl
+from enums.metadata_field_names import DocumentReferenceMetadataFields
+from enums.supported_document_types import SupportedDocumentTypes
+from models.statistics import (
+ ApplicationData,
+ OrganisationData,
+ RecordStoreData,
+ StatisticData,
+)
+from services.base.cloudwatch_service import CloudwatchService
+from services.base.dynamo_service import DynamoDBService
+from services.base.s3_service import S3Service
+from utils.audit_logging_setup import LoggingService
+from utils.cloudwatch_logs_query import (
+ CloudwatchLogsQueryParams,
+ LloydGeorgeRecordsDeleted,
+ LloydGeorgeRecordsDownloaded,
+ LloydGeorgeRecordsStored,
+ LloydGeorgeRecordsViewed,
+ UniqueActiveUserIds,
+)
+from utils.common_query_filters import UploadCompleted
+from utils.utilities import flatten, get_file_key_from_s3_url
+
+logger = LoggingService(__name__)
+
+Fields = DocumentReferenceMetadataFields
+OdsCodeFieldName: str = Fields.CURRENT_GP_ODS.value
+NhsNumberFieldName: str = Fields.NHS_NUMBER.value
+FileLocationFieldName: str = Fields.FILE_LOCATION.value
+ContentTypeFieldName: str = Fields.CONTENT_TYPE.value
+
+
+class DataCollectionService:
+ def __init__(self):
+ self.workspace = os.environ["WORKSPACE"]
+ self.output_table_name = os.environ["STATISTICS_TABLE"]
+
+ self.cloudwatch_service = CloudwatchService()
+ self.dynamodb_service = DynamoDBService()
+ self.s3_service = S3Service()
+
+ one_day = 60 * 60 * 24
+ time_now = int(datetime.now().timestamp())
+
+ self.collection_start_time = time_now - one_day
+ self.collection_end_time = time_now
+ self.today_date = datetime.today().strftime("%Y%m%d")
+
+ def collect_all_data_and_write_to_dynamodb(self):
+ time_period_human_readable = (
+ f"{datetime.fromtimestamp(self.collection_start_time)}"
+ f" ~ {datetime.fromtimestamp(self.collection_end_time)}"
+ )
+ logger.info(f"Collecting data between {time_period_human_readable}.")
+
+ all_statistic_data = self.collect_all_data()
+ logger.info("Finished collecting data. Will output to dynamodb table.")
+
+ self.write_to_dynamodb_table(all_statistic_data)
+ logger.info("Data collection completed.", {"Result": "Successful"})
+
+ def collect_all_data(self) -> list[StatisticData]:
+ dynamodb_scan_result = self.scan_dynamodb_tables()
+ s3_list_objects_result = self.get_all_s3_files_info()
+
+ record_store_data = self.get_record_store_data(
+ dynamodb_scan_result, s3_list_objects_result
+ )
+ organisation_data = self.get_organisation_data(dynamodb_scan_result)
+ application_data = self.get_application_data()
+
+ return record_store_data + organisation_data + application_data
+
+ def write_to_dynamodb_table(self, all_statistic_data: list[StatisticData]):
+ logger.info("Writing statistic data to dynamodb table")
+ item_list = []
+ for entry in all_statistic_data:
+ item_list.append(entry.model_dump(by_alias=True))
+
+ self.dynamodb_service.batch_writing(
+ table_name=self.output_table_name, item_list=item_list
+ )
+
+ logger.info("Finish writing all data to dynamodb table")
+
+ def scan_dynamodb_tables(self) -> list[dict]:
+ all_results = []
+
+ field_names_to_fetch = [
+ OdsCodeFieldName,
+ NhsNumberFieldName,
+ FileLocationFieldName,
+ ContentTypeFieldName,
+ ]
+ project_expression = ",".join(field_names_to_fetch)
+ filter_expression = UploadCompleted
+
+ for doc_type in SupportedDocumentTypes.list():
+ table_name = doc_type.get_dynamodb_table_name()
+ result = self.dynamodb_service.scan_whole_table(
+ table_name=table_name,
+ project_expression=project_expression,
+ filter_expression=filter_expression,
+ )
+ all_results.extend(result)
+
+ return all_results
+
+ def get_all_s3_files_info(self) -> list[dict]:
+ all_results = []
+ for doc_type in SupportedDocumentTypes.list():
+ bucket_name = doc_type.get_s3_bucket_name()
+ result = self.s3_service.list_all_objects(bucket_name)
+ all_results += result
+
+ return all_results
+
+ def get_record_store_data(
+ self,
+ dynamodb_scan_result: list[dict],
+ s3_list_objects_result: list[dict],
+ ) -> list[RecordStoreData]:
+ total_number_of_records = self.get_total_number_of_records(dynamodb_scan_result)
+
+ total_and_average_file_sizes = (
+ self.get_metrics_for_total_and_average_file_sizes(
+ dynamodb_scan_result, s3_list_objects_result
+ )
+ )
+
+ number_of_document_types = self.get_number_of_document_types(
+ dynamodb_scan_result
+ )
+
+ joined_query_result = self.join_results_by_ods_code(
+ [
+ total_number_of_records,
+ total_and_average_file_sizes,
+ number_of_document_types,
+ ]
+ )
+
+ record_store_data_for_all_ods_code = [
+ RecordStoreData(
+ date=self.today_date,
+ **record_store_data_properties,
+ )
+ for record_store_data_properties in joined_query_result
+ ]
+
+ return record_store_data_for_all_ods_code
+
+ def get_organisation_data(
+ self, dynamodb_scan_result: list[dict]
+ ) -> list[OrganisationData]:
+ number_of_patients = self.get_number_of_patients(dynamodb_scan_result)
+ average_records_per_patient = self.get_average_number_of_files_per_patient(
+ dynamodb_scan_result
+ )
+ daily_count_viewed = self.get_cloud_watch_query_result(LloydGeorgeRecordsViewed)
+ daily_count_downloaded = self.get_cloud_watch_query_result(
+ LloydGeorgeRecordsDownloaded
+ )
+ daily_count_deleted = self.get_cloud_watch_query_result(
+ LloydGeorgeRecordsDeleted
+ )
+ daily_count_stored = self.get_cloud_watch_query_result(LloydGeorgeRecordsStored)
+
+ joined_query_result = self.join_results_by_ods_code(
+ [
+ number_of_patients,
+ average_records_per_patient,
+ daily_count_viewed,
+ daily_count_downloaded,
+ daily_count_deleted,
+ daily_count_stored,
+ ]
+ )
+
+ organisation_data_for_all_ods_code = [
+ OrganisationData(date=self.today_date, **organisation_data_properties)
+ for organisation_data_properties in joined_query_result
+ ]
+
+ return organisation_data_for_all_ods_code
+
+ def get_application_data(self) -> list[ApplicationData]:
+ user_id_per_ods_code = self.get_active_user_list()
+ application_data_for_all_ods_code = [
+ ApplicationData(
+ date=self.today_date,
+ active_user_ids_hashed=active_user_ids_hashed,
+ ods_code=ods_code,
+ )
+ for ods_code, active_user_ids_hashed in user_id_per_ods_code.items()
+ ]
+ return application_data_for_all_ods_code
+
+ def get_active_user_list(self) -> dict[str, list]:
+ query_result = self.get_cloud_watch_query_result(
+ query_params=UniqueActiveUserIds
+ )
+ user_ids_per_ods_code = defaultdict(list)
+ for entry in query_result:
+ ods_code = entry.get("ods_code")
+ user_id = entry.get("user_id")
+ hashed_user_id = hashlib.sha256(bytes(user_id, "utf8")).hexdigest()
+ user_ids_per_ods_code[ods_code].append(hashed_user_id)
+
+ return user_ids_per_ods_code
+
+ def get_cloud_watch_query_result(
+ self, query_params: CloudwatchLogsQueryParams
+ ) -> list[dict]:
+ return self.cloudwatch_service.query_logs(
+ query_params=query_params,
+ start_time=self.collection_start_time,
+ end_time=self.collection_end_time,
+ )
+
+ def get_total_number_of_records(
+ self, dynamodb_scan_result: list[dict]
+ ) -> list[dict]:
+ ods_code_for_every_document = [
+ item[OdsCodeFieldName] for item in dynamodb_scan_result
+ ]
+ counted_by_ods_code = Counter(ods_code_for_every_document)
+ count_result_in_list_of_dict = [
+ {"ods_code": key, "total_number_of_records": value}
+ for key, value in counted_by_ods_code.items()
+ ]
+ return count_result_in_list_of_dict
+
+ def get_number_of_patients(self, dynamodb_scan_result: list[dict]) -> list[dict]:
+ patients_grouped_by_ods_code = defaultdict(set)
+ for item in dynamodb_scan_result:
+ ods_code = item[OdsCodeFieldName]
+ patients_grouped_by_ods_code[ods_code].add(item[NhsNumberFieldName])
+
+ return [
+ {"ods_code": ods_code, "number_of_patients": len(patients)}
+ for ods_code, patients in patients_grouped_by_ods_code.items()
+ ]
+
+ def get_metrics_for_total_and_average_file_sizes(
+ self,
+ dynamodb_scan_result: list[dict],
+ s3_list_objects_result: list[dict],
+ ) -> list[dict]:
+ dynamodb_df = pl.DataFrame(dynamodb_scan_result)
+ s3_df = pl.DataFrame(s3_list_objects_result)
+
+ dynamodb_df_with_file_key = dynamodb_df.with_columns(
+ pl.col(FileLocationFieldName)
+ .map_elements(get_file_key_from_s3_url, return_dtype=pl.String)
+ .alias("S3FileKey")
+ )
+ joined_df = dynamodb_df_with_file_key.join(
+ s3_df, how="left", left_on="S3FileKey", right_on="Key", coalesce=True
+ )
+
+ get_total_size = (pl.col("Size").sum() / 1024 / 1024).alias(
+ "TotalFileSizeForPatientInMegabytes"
+ )
+
+ get_average_file_size_per_patient = (
+ pl.col("TotalFileSizeForPatientInMegabytes")
+ .mean()
+ .alias("average_size_of_documents_per_patient_in_megabytes")
+ )
+
+ get_total_file_size_for_ods_code = (
+ pl.col("TotalFileSizeForPatientInMegabytes")
+ .sum()
+ .alias("total_size_of_records_in_megabytes")
+ )
+
+ result = (
+ joined_df.group_by(OdsCodeFieldName, NhsNumberFieldName)
+ .agg(get_total_size)
+ .group_by(OdsCodeFieldName)
+ .agg(get_average_file_size_per_patient, get_total_file_size_for_ods_code)
+ .rename({OdsCodeFieldName: "ods_code"})
+ )
+
+ return result.to_dicts()
+
+ def get_number_of_document_types(
+ self, dynamodb_scan_result: list[dict]
+ ) -> list[dict]:
+ file_types_grouped_by_ods_code = defaultdict(set)
+ for item in dynamodb_scan_result:
+ ods_code = item[OdsCodeFieldName]
+ file_type = item[ContentTypeFieldName]
+ file_types_grouped_by_ods_code[ods_code].add(file_type)
+
+ return [
+ {"ods_code": ods_code, "number_of_document_types": len(file_types)}
+ for ods_code, file_types in file_types_grouped_by_ods_code.items()
+ ]
+
+ def get_average_number_of_files_per_patient(
+ self,
+ dynamodb_scan_result: list[dict],
+ ) -> list[dict]:
+ dynamodb_df = pl.DataFrame(dynamodb_scan_result)
+
+ count_records = pl.len().alias("number_of_records")
+ take_average_of_record_count = (
+ pl.col("number_of_records").mean().alias("average_records_per_patient")
+ )
+
+ result = (
+ dynamodb_df.group_by(OdsCodeFieldName, NhsNumberFieldName)
+ .agg(count_records)
+ .group_by(OdsCodeFieldName)
+ .agg(take_average_of_record_count)
+ .rename({OdsCodeFieldName: "ods_code"})
+ )
+ return result.to_dicts()
+
+ @staticmethod
+ def join_results_by_ods_code(results: list[list[dict]]) -> list[dict]:
+ joined_by_ods_code = defaultdict(dict)
+ for entry in flatten(results):
+ ods_code = entry["ods_code"]
+ joined_by_ods_code[ods_code].update(entry)
+
+ joined_result = list(joined_by_ods_code.values())
+
+ return joined_result
diff --git a/lambdas/services/login_service.py b/lambdas/services/login_service.py
index a18907cec..bfafc1d2b 100644
--- a/lambdas/services/login_service.py
+++ b/lambdas/services/login_service.py
@@ -142,7 +142,7 @@ def generate_session(self, state, auth_code) -> dict:
def have_matching_state_value_in_record(self, state: str) -> bool:
state_table_name = os.environ["AUTH_STATE_TABLE_NAME"]
- query_response = self.db_service.simple_query(
+ query_response = self.db_service.query_all_fields(
table_name=state_table_name, key_condition_expression=Key("State").eq(state)
)
diff --git a/lambdas/services/statistical_report_service.py b/lambdas/services/statistical_report_service.py
new file mode 100644
index 000000000..e2f527f32
--- /dev/null
+++ b/lambdas/services/statistical_report_service.py
@@ -0,0 +1,225 @@
+import os
+import shutil
+import tempfile
+from datetime import datetime, timedelta
+
+import polars as pl
+import polars.selectors as column_select
+from boto3.dynamodb.conditions import Key
+from inflection import humanize
+from models.statistics import (
+ ApplicationData,
+ LoadedStatisticData,
+ OrganisationData,
+ RecordStoreData,
+ StatisticData,
+ load_from_dynamodb_items,
+)
+from services.base.dynamo_service import DynamoDBService
+from services.base.s3_service import S3Service
+from utils.audit_logging_setup import LoggingService
+from utils.exceptions import StatisticDataNotFoundException
+
+logger = LoggingService(__name__)
+
+
+class StatisticalReportService:
+ def __init__(self):
+ self.dynamo_service = DynamoDBService()
+ self.statistic_table = os.environ["STATISTICS_TABLE"]
+
+ self.s3_service = S3Service()
+ self.reports_bucket = os.environ["STATISTICAL_REPORTS_BUCKET"]
+
+ last_seven_days = [
+ datetime.today() - timedelta(days=i) for i in range(7, 0, -1)
+ ]
+ self.dates_to_collect: list[str] = [
+ date.strftime("%Y%m%d") for date in last_seven_days
+ ]
+ self.report_period = f"{self.dates_to_collect[0]}-{self.dates_to_collect[-1]}"
+
+ def make_weekly_summary_and_output_to_bucket(self) -> None:
+ weekly_summary = self.make_weekly_summary()
+ self.store_report_to_s3(weekly_summary)
+
+ def make_weekly_summary(self) -> pl.DataFrame:
+ (record_store_data, organisation_data, application_data) = (
+ self.get_statistic_data()
+ )
+
+ weekly_record_store_data = self.summarise_record_store_data(record_store_data)
+ weekly_organisation_data = self.summarise_organisation_data(organisation_data)
+ weekly_application_data = self.summarise_application_data(application_data)
+
+ all_summarised_data = [
+ weekly_record_store_data,
+ weekly_organisation_data,
+ weekly_application_data,
+ ]
+
+ combined_data = self.join_dataframes_by_ods_code(all_summarised_data)
+ weekly_summary = self.tidy_up_data(combined_data)
+
+ return weekly_summary
+
+ def get_statistic_data(self) -> LoadedStatisticData:
+ logger.info("Loading statistic data of previous week from dynamodb...")
+ logger.info(f"The period to report: {self.dates_to_collect}")
+ dynamodb_items = []
+ for date in self.dates_to_collect:
+ response = self.dynamo_service.query_all_fields(
+ table_name=self.statistic_table,
+ key_condition_expression=Key("Date").eq(date),
+ )
+ dynamodb_items.extend(response["Items"])
+
+ loaded_data = load_from_dynamodb_items(dynamodb_items)
+
+ all_data_empty = all(not data for data in loaded_data)
+ if all_data_empty:
+ logger.error(
+ f"No statistic data can be found during the period {self.report_period}. "
+ "Please check whether the data collection lambda worked properly.",
+ {"Result": "Statistic data not available."},
+ )
+ raise StatisticDataNotFoundException(
+ f"No statistic data can be found during the period {self.report_period}"
+ )
+
+ return loaded_data
+
+ @staticmethod
+ def load_data_to_polars(data: list[StatisticData]) -> pl.DataFrame:
+ cast_decimal_to_float = column_select.by_dtype(pl.datatypes.Decimal).cast(
+ pl.Float64
+ )
+ loaded_data = pl.DataFrame(data).with_columns(cast_decimal_to_float)
+ return loaded_data
+
+ def summarise_record_store_data(
+ self, record_store_data: list[RecordStoreData]
+ ) -> pl.DataFrame:
+ logger.info("Summarising RecordStoreData...")
+ if not record_store_data:
+ logger.info("RecordStoreData for this period was empty.")
+ return pl.DataFrame()
+
+ df = self.load_data_to_polars(record_store_data)
+
+ select_most_recent_records = pl.all().sort_by("date").last()
+ summarised_data = (
+ df.group_by("ods_code")
+ .agg(select_most_recent_records)
+ .drop("date", "statistic_id")
+ )
+
+ return summarised_data
+
+ def summarise_organisation_data(
+ self, organisation_data: list[OrganisationData]
+ ) -> pl.DataFrame:
+ logger.info("Summarising OrganisationData...")
+ if not organisation_data:
+ logger.info("OrganisationData for this period was empty.")
+ return pl.DataFrame()
+
+ df = self.load_data_to_polars(organisation_data)
+
+ sum_daily_count_to_weekly = (
+ column_select.matches("daily")
+ .sum()
+ .name.map(lambda column_name: column_name.replace("daily", "weekly"))
+ )
+ take_average_for_patient_record = (
+ pl.col("average_records_per_patient").mean().round(3)
+ )
+ select_most_recent_number_of_patients = (
+ pl.col("number_of_patients").sort_by("date").last()
+ )
+ summarised_data = df.group_by("ods_code").agg(
+ sum_daily_count_to_weekly,
+ take_average_for_patient_record,
+ select_most_recent_number_of_patients,
+ )
+ return summarised_data
+
+ def summarise_application_data(
+ self, application_data: list[ApplicationData]
+ ) -> pl.DataFrame:
+ logger.info("Summarising ApplicationData...")
+ if not application_data:
+ logger.info("ApplicationData for this period was empty.")
+ return pl.DataFrame()
+
+ df = self.load_data_to_polars(application_data)
+
+ count_unique_ids = (
+ pl.concat_list("active_user_ids_hashed")
+ .flatten()
+ .unique()
+ .len()
+ .alias("active_users_count")
+ )
+ summarised_data = df.group_by("ods_code").agg(count_unique_ids)
+ return summarised_data
+
+ def join_dataframes_by_ods_code(
+ self, all_summarised_data: list[pl.DataFrame]
+ ) -> pl.DataFrame:
+ data_to_report = [df for df in all_summarised_data if not df.is_empty()]
+ joined_dataframe = data_to_report[0]
+
+ for other_dataframe in data_to_report[1:]:
+ joined_dataframe = joined_dataframe.join(
+ other_dataframe, on="ods_code", how="outer_coalesce"
+ )
+
+ return joined_dataframe
+
+ def tidy_up_data(self, joined_data: pl.DataFrame) -> pl.DataFrame:
+ with_date_column_updated = self.update_date_column(joined_data)
+ with_columns_reordered = self.reorder_columns(with_date_column_updated)
+ with_columns_renamed = with_columns_reordered.rename(
+ self.rename_snakecase_columns
+ )
+
+ return with_columns_renamed
+
+ def update_date_column(self, joined_data: pl.DataFrame) -> pl.DataFrame:
+ date_column_filled_with_report_period = joined_data.with_columns(
+ pl.lit(self.report_period).alias("date")
+ )
+ return date_column_filled_with_report_period
+
+ def reorder_columns(self, joined_data: pl.DataFrame) -> pl.DataFrame:
+ all_columns_names = joined_data.columns
+ columns_to_go_first = ["date", "ods_code"]
+ other_columns = sorted(set(all_columns_names) - set(columns_to_go_first))
+ with_columns_reordered = joined_data.select(
+ *columns_to_go_first, *other_columns
+ )
+ return with_columns_reordered
+
+ @staticmethod
+ def rename_snakecase_columns(column_name: str) -> str:
+ if column_name == "ods_code":
+ return "ODS code"
+ else:
+ return humanize(column_name)
+
+ def store_report_to_s3(self, weekly_summary: pl.DataFrame) -> None:
+ logger.info("Saving the weekly report as .csv")
+ file_name = f"statistical_report_{self.report_period}.csv"
+ temp_folder = tempfile.mkdtemp()
+ local_file_path = os.path.join(temp_folder, file_name)
+ try:
+ weekly_summary.write_csv(local_file_path)
+
+ logger.info("Uploading the csv report to S3 bucket...")
+ self.s3_service.upload_file(local_file_path, self.reports_bucket, file_name)
+
+ logger.info("The weekly report is stored in s3 bucket.")
+ logger.info(f"File name: {file_name}")
+ finally:
+ shutil.rmtree(temp_folder)
diff --git a/lambdas/tests/unit/conftest.py b/lambdas/tests/unit/conftest.py
index 5eea7fcff..9fc7c630e 100644
--- a/lambdas/tests/unit/conftest.py
+++ b/lambdas/tests/unit/conftest.py
@@ -1,4 +1,5 @@
import json
+import tempfile
from dataclasses import dataclass
from enum import Enum
from unittest import mock
@@ -44,6 +45,8 @@
MOCK_APPCONFIG_APPLICATION_ENV_NAME = "APPCONFIG_APPLICATION"
MOCK_APPCONFIG_ENVIRONMENT_ENV_NAME = "APPCONFIG_ENVIRONMENT"
MOCK_APPCONFIG_CONFIGURATION_ENV_NAME = "APPCONFIG_CONFIGURATION"
+MOCK_STATISTICS_TABLE_NAME = "STATISTICS_TABLE"
+MOCK_STATISTICS_REPORT_BUCKET_NAME = "STATISTICAL_REPORTS_BUCKET"
MOCK_ARF_TABLE_NAME = "test_arf_dynamoDB_table"
MOCK_LG_TABLE_NAME = "test_lg_dynamoDB_table"
@@ -55,6 +58,8 @@
MOCK_LG_STAGING_STORE_BUCKET = "test_staging_bulk_store"
MOCK_LG_METADATA_SQS_QUEUE = "test_bulk_upload_metadata_queue"
MOCK_LG_INVALID_SQS_QUEUE = "INVALID_SQS_QUEUE_URL"
+MOCK_STATISTICS_TABLE = "test_statistics_table"
+MOCK_STATISTICS_REPORT_BUCKET = "test_statistics_report_bucket"
TEST_NHS_NUMBER = "9000000009"
TEST_OBJECT_KEY = "1234-4567-8912-HSDF-TEST"
@@ -137,16 +142,20 @@ def set_env(monkeypatch):
)
monkeypatch.setenv(
MOCK_APPCONFIG_APPLICATION_ENV_NAME, MOCK_APPCONFIG_APPLICATION_ID
- ),
+ )
monkeypatch.setenv(
MOCK_APPCONFIG_ENVIRONMENT_ENV_NAME, MOCK_APPCONFIG_ENVIRONMENT_ID
- ),
+ )
monkeypatch.setenv(
MOCK_APPCONFIG_CONFIGURATION_ENV_NAME, MOCK_APPCONFIG_CONFIGURATION_ID
)
monkeypatch.setenv(
MOCK_PRESIGNED_URL_ROLE_ARN_KEY, MOCK_PRESIGNED_URL_ROLE_ARN_VALUE
)
+ monkeypatch.setenv(MOCK_STATISTICS_TABLE_NAME, MOCK_STATISTICS_TABLE)
+ monkeypatch.setenv(
+ MOCK_STATISTICS_REPORT_BUCKET_NAME, MOCK_STATISTICS_REPORT_BUCKET
+ )
@pytest.fixture(scope="session", autouse=True)
@@ -235,3 +244,10 @@ class MockError(Enum):
"err_code": "AB_XXXX",
"interaction_id": "88888888-4444-4444-4444-121212121212",
}
+
+
+@pytest.fixture
+def mock_temp_folder(mocker):
+ temp_folder = tempfile.mkdtemp()
+ mocker.patch.object(tempfile, "mkdtemp", return_value=temp_folder)
+ yield temp_folder
diff --git a/lambdas/tests/unit/enums/test_supported_document_types.py b/lambdas/tests/unit/enums/test_supported_document_types.py
index bd69cc1d1..c2685abc5 100644
--- a/lambdas/tests/unit/enums/test_supported_document_types.py
+++ b/lambdas/tests/unit/enums/test_supported_document_types.py
@@ -1,6 +1,11 @@
import pytest
from enums.supported_document_types import SupportedDocumentTypes
-from tests.unit.conftest import MOCK_ARF_TABLE_NAME, MOCK_LG_TABLE_NAME
+from tests.unit.conftest import (
+ MOCK_ARF_BUCKET,
+ MOCK_ARF_TABLE_NAME,
+ MOCK_LG_BUCKET,
+ MOCK_LG_TABLE_NAME,
+)
@pytest.mark.parametrize(
@@ -15,3 +20,17 @@ def test_get_dynamodb_table_name_return_table_name(set_env, doc_type, expected):
actual = doc_type_enum.get_dynamodb_table_name()
assert actual == expected
+
+
+@pytest.mark.parametrize(
+ ["doc_type", "expected"],
+ [
+ (SupportedDocumentTypes.ARF, MOCK_ARF_BUCKET),
+ (SupportedDocumentTypes.LG, MOCK_LG_BUCKET),
+ ],
+)
+def test_get_s3_bucket_name_return_bucket_name(set_env, doc_type, expected):
+ doc_type_enum = SupportedDocumentTypes(doc_type)
+ actual = doc_type_enum.get_s3_bucket_name()
+
+ assert actual == expected
diff --git a/lambdas/tests/unit/handlers/test_data_collection_handler.py b/lambdas/tests/unit/handlers/test_data_collection_handler.py
new file mode 100644
index 000000000..fb6e5aa6f
--- /dev/null
+++ b/lambdas/tests/unit/handlers/test_data_collection_handler.py
@@ -0,0 +1,22 @@
+import pytest
+from handlers.data_collection_handler import lambda_handler
+from services.data_collection_service import DataCollectionService
+from utils.exceptions import MissingEnvVarException
+
+
+def test_lambda_handler_call_underlying_service(mocker, context, set_env):
+ mock_data_collection_service = mocker.patch(
+ "handlers.data_collection_handler.DataCollectionService",
+ spec=DataCollectionService,
+ ).return_value
+
+ lambda_handler(None, context)
+
+ mock_data_collection_service.collect_all_data_and_write_to_dynamodb.assert_called_once()
+
+
+def test_lambda_handler_check_for_env_vars(context):
+ with pytest.raises(MissingEnvVarException) as error:
+ lambda_handler(None, context)
+
+ assert "An error occurred due to missing environment variable" in str(error.value)
diff --git a/lambdas/tests/unit/handlers/test_statistical_report_handler.py b/lambdas/tests/unit/handlers/test_statistical_report_handler.py
new file mode 100644
index 000000000..fa145e7fb
--- /dev/null
+++ b/lambdas/tests/unit/handlers/test_statistical_report_handler.py
@@ -0,0 +1,22 @@
+import pytest
+from handlers.statistical_report_handler import lambda_handler
+from services.statistical_report_service import StatisticalReportService
+from utils.exceptions import MissingEnvVarException
+
+
+def test_lambda_handler_call_underlying_service(mocker, context, set_env):
+ mock_statistical_report_service = mocker.patch(
+ "handlers.statistical_report_handler.StatisticalReportService",
+ spec=StatisticalReportService,
+ ).return_value
+
+ lambda_handler(None, context)
+
+ mock_statistical_report_service.make_weekly_summary_and_output_to_bucket.assert_called_once()
+
+
+def test_lambda_handler_check_for_env_vars(context):
+ with pytest.raises(MissingEnvVarException) as error:
+ lambda_handler(None, context)
+
+ assert "An error occurred due to missing environment variable" in str(error.value)
diff --git a/lambdas/tests/unit/helpers/data/dynamo_scan_response.py b/lambdas/tests/unit/helpers/data/dynamo_scan_response.py
index e39976e4a..962ba2484 100644
--- a/lambdas/tests/unit/helpers/data/dynamo_scan_response.py
+++ b/lambdas/tests/unit/helpers/data/dynamo_scan_response.py
@@ -1,5 +1,7 @@
from copy import deepcopy
+from utils.utilities import flatten
+
MOCK_RESPONSE = {
"Items": [
{
@@ -106,3 +108,78 @@
"RetryAttempts": 0,
},
}
+
+MOCK_PAGINATED_RESPONSE_1 = {
+ "Items": [
+ {
+ "FileName": "1of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ {
+ "FileName": "3of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ {
+ "FileName": "5of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ {
+ "FileName": "7of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ ],
+ "Count": 4,
+ "ScannedCount": 4,
+ "LastEvaluatedKey": {"ID": "id_token_for_page_2"},
+}
+
+
+MOCK_PAGINATED_RESPONSE_2 = {
+ "Items": [
+ {
+ "FileName": "2of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ {
+ "FileName": "10of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ {
+ "FileName": "8of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ {
+ "FileName": "6of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ ],
+ "Count": 4,
+ "ScannedCount": 4,
+ "LastEvaluatedKey": {"ID": "id_token_for_page_3"},
+}
+
+MOCK_PAGINATED_RESPONSE_3 = {
+ "Items": [
+ {
+ "FileName": "9of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ {
+ "FileName": "4of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf",
+ "VirusScannerResult": "Clean",
+ },
+ ],
+ "Count": 2,
+ "ScannedCount": 2,
+}
+
+EXPECTED_ITEMS_FOR_PAGINATED_RESULTS = flatten(
+ [
+ response["Items"]
+ for response in [
+ MOCK_PAGINATED_RESPONSE_1,
+ MOCK_PAGINATED_RESPONSE_2,
+ MOCK_PAGINATED_RESPONSE_3,
+ ]
+ ]
+)
diff --git a/lambdas/tests/unit/helpers/data/s3_responses.py b/lambdas/tests/unit/helpers/data/s3_responses.py
index 5a385de84..fb8150626 100644
--- a/lambdas/tests/unit/helpers/data/s3_responses.py
+++ b/lambdas/tests/unit/helpers/data/s3_responses.py
@@ -10,3 +10,75 @@
"x-amz-signature": "test-signature",
},
}
+
+MOCK_LIST_OBJECTS_RESPONSE = {
+ "ResponseMetadata": {
+ "RequestId": "abc",
+ "HostId": "xyz",
+ "HTTPStatusCode": 200,
+ "HTTPHeaders": {
+ "x-amz-id-2": "efg",
+ "x-amz-request-id": "HIJ",
+ "date": "Wed, 04 Jun 2024 13:49:45 GMT",
+ "x-amz-bucket-region": "eu-west-2",
+ "content-type": "application/xml",
+ "transfer-encoding": "chunked",
+ "server": "AmazonS3",
+ },
+ "RetryAttempts": 0,
+ },
+ "IsTruncated": False,
+ "Contents": [
+ {
+ "Key": "9000000009/2985a5dd-37ac-481a-b847-ee09e4b0817b",
+ "ETag": '"ddeafe0237ac7cb097c9a34c0e21a8a9"',
+ "Size": 928,
+ "StorageClass": "STANDARD",
+ },
+ {
+ "Key": "9000000009/384b886d-bd86-4211-9f43-73f7146fbb9b",
+ "ETag": '"ddeafe0237ac7cb097c9a34c0e21a8a9"',
+ "Size": 928,
+ "StorageClass": "STANDARD",
+ },
+ ],
+ "Name": "test-lg-bucket",
+ "Prefix": "",
+ "MaxKeys": 1000,
+ "EncodingType": "url",
+ "KeyCount": 2,
+}
+
+
+MOCK_LIST_OBJECTS_PAGINATED_RESPONSES = [
+ {
+ "IsTruncated": True,
+ "Contents": [
+ {
+ "Key": "9000000009/2985a5dd-37ac-481a-b847-ee09e4b0817b",
+ "Size": 928,
+ "StorageClass": "STANDARD",
+ },
+ {
+ "Key": "9000000009/384b886d-bd86-4211-9f43-73f7146fbb9b",
+ "Size": 928,
+ "StorageClass": "STANDARD",
+ },
+ ],
+ },
+ {
+ "IsTruncated": True,
+ "Contents": [
+ {
+ "Key": "9000000009/94c93a0c-a322-4eaa-ad0b-29ea876c33a5",
+ "Size": 928,
+ "StorageClass": "STANDARD",
+ },
+ {
+ "Key": "9000000009/36af7807-7965-4c17-b2eb-0f2ae903196d",
+ "Size": 928,
+ "StorageClass": "STANDARD",
+ },
+ ],
+ },
+]
diff --git a/lambdas/tests/unit/helpers/data/statistic/__init__.py b/lambdas/tests/unit/helpers/data/statistic/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_collected_data.py b/lambdas/tests/unit/helpers/data/statistic/mock_collected_data.py
new file mode 100644
index 000000000..0553358bf
--- /dev/null
+++ b/lambdas/tests/unit/helpers/data/statistic/mock_collected_data.py
@@ -0,0 +1,80 @@
+from datetime import datetime
+
+from models.statistics import ApplicationData, OrganisationData, RecordStoreData
+from tests.unit.helpers.data.statistic.mock_dynamodb_and_s3_records import (
+ TOTAL_FILE_SIZE_FOR_H81109,
+ TOTAL_FILE_SIZE_FOR_Y12345,
+)
+from tests.unit.helpers.data.statistic.mock_logs_query_results import (
+ HASHED_USER_ID_1,
+ HASHED_USER_ID_2,
+)
+
+TODAY_DATE = datetime.today().strftime("%Y%m%d")
+
+MOCK_RECORD_STORE_DATA = [
+ RecordStoreData(
+ statistic_id="mock_uuid",
+ date=TODAY_DATE,
+ ods_code="H81109",
+ total_number_of_records=6,
+ number_of_document_types=2,
+ total_size_of_records_in_megabytes=TOTAL_FILE_SIZE_FOR_H81109,
+ average_size_of_documents_per_patient_in_megabytes=TOTAL_FILE_SIZE_FOR_H81109
+ / 2,
+ ),
+ RecordStoreData(
+ statistic_id="mock_uuid",
+ date=TODAY_DATE,
+ ods_code="Y12345",
+ total_number_of_records=2,
+ number_of_document_types=2,
+ total_size_of_records_in_megabytes=TOTAL_FILE_SIZE_FOR_Y12345,
+ average_size_of_documents_per_patient_in_megabytes=TOTAL_FILE_SIZE_FOR_Y12345,
+ ),
+]
+
+MOCK_ORGANISATION_DATA = [
+ OrganisationData(
+ statistic_id="mock_uuid",
+ date=TODAY_DATE,
+ ods_code="H81109",
+ number_of_patients=2,
+ average_records_per_patient=3,
+ daily_count_stored=4,
+ daily_count_viewed=40,
+ daily_count_downloaded=20,
+ daily_count_deleted=2,
+ ),
+ OrganisationData(
+ statistic_id="mock_uuid",
+ date=TODAY_DATE,
+ ods_code="Y12345",
+ number_of_patients=1,
+ average_records_per_patient=2,
+ daily_count_stored=2,
+ daily_count_viewed=20,
+ daily_count_downloaded=10,
+ daily_count_deleted=1,
+ ),
+]
+
+MOCK_APPLICATION_DATA = [
+ ApplicationData(
+ statistic_id="mock_uuid",
+ date=TODAY_DATE,
+ ods_code="H81109",
+ active_user_ids_hashed=[HASHED_USER_ID_1, HASHED_USER_ID_2],
+ ),
+ ApplicationData(
+ statistic_id="mock_uuid",
+ date=TODAY_DATE,
+ ods_code="Y12345",
+ active_user_ids_hashed=[HASHED_USER_ID_1],
+ ),
+]
+
+ALL_MOCK_DATA = MOCK_RECORD_STORE_DATA + MOCK_ORGANISATION_DATA + MOCK_APPLICATION_DATA
+ALL_MOCK_DATA_AS_JSON_LIST = list(
+ map(lambda data: data.model_dump(by_alias=True), ALL_MOCK_DATA)
+)
diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_data_build_utils.py b/lambdas/tests/unit/helpers/data/statistic/mock_data_build_utils.py
new file mode 100644
index 000000000..cd5f38ded
--- /dev/null
+++ b/lambdas/tests/unit/helpers/data/statistic/mock_data_build_utils.py
@@ -0,0 +1,57 @@
+import random
+
+from models.statistics import ApplicationData, OrganisationData, RecordStoreData
+
+
+def make_random_data(
+ ods_code: str,
+ date_range: list[str],
+ field_names: set[str],
+) -> list[dict]:
+ results = []
+ for date in date_range:
+ random_data: dict[str, str | int] = {
+ key: random.randint(0, 1000) for key in field_names
+ }
+ random_data.update({"date": date, "ods_code": ods_code})
+ results.append(random_data)
+ return results
+
+
+def build_random_record_store_data(
+ ods_code: str, date_range: list[str]
+) -> list[RecordStoreData]:
+ field_names = set(RecordStoreData.model_fields.keys()) - {
+ "ods_code",
+ "date",
+ "statistic_id",
+ }
+ all_random_data = make_random_data(ods_code, date_range, field_names)
+ return [RecordStoreData(**data) for data in all_random_data]
+
+
+def build_random_organisation_data(
+ ods_code: str, date_range: list[str]
+) -> list[OrganisationData]:
+ field_names = set(OrganisationData.model_fields.keys()) - {
+ "ods_code",
+ "date",
+ "statistic_id",
+ }
+ all_random_data = make_random_data(ods_code, date_range, field_names)
+ return [OrganisationData(**data) for data in all_random_data]
+
+
+def build_random_application_data(
+ ods_code: str, date_range: list[str]
+) -> list[ApplicationData]:
+ result = []
+ for date in date_range:
+ random_numbers = random.sample(range(30), k=random.randint(1, 10))
+ hashed_user_ids = [f"userid_{number}" for number in random_numbers]
+ application_data = ApplicationData(
+ ods_code=ods_code, date=date, active_user_ids_hashed=hashed_user_ids
+ )
+ result.append(application_data)
+
+ return result
diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_dynamodb_and_s3_records.py b/lambdas/tests/unit/helpers/data/statistic/mock_dynamodb_and_s3_records.py
new file mode 100644
index 000000000..e1f22d551
--- /dev/null
+++ b/lambdas/tests/unit/helpers/data/statistic/mock_dynamodb_and_s3_records.py
@@ -0,0 +1,127 @@
+import math
+import random
+import uuid
+from typing import Tuple
+
+PDF_MIME_TYPE = "application/pdf"
+
+
+def build_mock_results(
+ ods_code: str,
+ nhs_number: str,
+ number_of_files: int = 3,
+ total_file_size_in_mb: int = 5,
+) -> Tuple[list[dict], list[dict]]:
+ file_ids = [uuid.uuid4() for _ in range(number_of_files)]
+ file_size_randoms = [random.randint(1, 10) for _ in range(number_of_files)]
+ total_file_size = total_file_size_in_mb * 1024 * 1024
+ file_sizes = [
+ math.floor(x * total_file_size / sum(file_size_randoms))
+ for x in file_size_randoms
+ ]
+ file_sizes[-1] = total_file_size - sum(file_sizes[:-1])
+
+ dynamodb_scan_result = [
+ {
+ "CurrentGpOds": ods_code,
+ "ContentType": PDF_MIME_TYPE,
+ "FileLocation": f"s3://test-lg-table/{nhs_number}/{file_id}",
+ "NhsNumber": nhs_number,
+ }
+ for file_id in file_ids
+ ]
+
+ s3_list_objects_result = [
+ {
+ "Key": f"{nhs_number}/{file_ids[index]}",
+ "Size": file_sizes[index],
+ }
+ for index in range(number_of_files)
+ ]
+
+ return dynamodb_scan_result, s3_list_objects_result
+
+
+MOCK_LG_SCAN_RESULT = [
+ {
+ "CurrentGpOds": "H81109",
+ "ContentType": PDF_MIME_TYPE,
+ "FileLocation": "s3://test-lg-table/9000000009/4ac21f7b-abd6-46c9-bf55-e7bafccba2ab",
+ "NhsNumber": "9000000009",
+ },
+ {
+ "CurrentGpOds": "H81109",
+ "ContentType": PDF_MIME_TYPE,
+ "FileLocation": "s3://test-lg-table/9000000009/95764e53-b5fd-47b8-8756-1c5604121444",
+ "NhsNumber": "9000000009",
+ },
+ {
+ "CurrentGpOds": "H81109",
+ "ContentType": PDF_MIME_TYPE,
+ "FileLocation": "s3://test-lg-table/9000000009/0ab18243-783a-4044-8146-b5b0996d8422",
+ "NhsNumber": "9000000009",
+ },
+ {
+ "CurrentGpOds": "H81109",
+ "ContentType": PDF_MIME_TYPE,
+ "FileLocation": "s3://test-lg-table/9000000001/5d5b3c28-e6c8-4d46-8ae3-a2b97321bcf8",
+ "NhsNumber": "9000000001",
+ },
+ {
+ "CurrentGpOds": "H81109",
+ "ContentType": PDF_MIME_TYPE,
+ "FileLocation": "s3://test-lg-table/9000000001/2543ba87-dcdb-4583-bae2-e83c4ba7af34",
+ "NhsNumber": "9000000001",
+ },
+]
+
+MOCK_ARF_SCAN_RESULT = [
+ {
+ "CurrentGpOds": "Y12345",
+ "ContentType": "application/msword",
+ "FileLocation": "s3://test-arf-table/9000000005/beec3523-1428-4c5f-b718-6ef25a4db1b9",
+ "NhsNumber": "9000000005",
+ },
+ {
+ "CurrentGpOds": "Y12345",
+ "ContentType": "image/jpeg",
+ "FileLocation": "s3://test-arf-table/9000000005/d2cf885b-9e78-4c29-bf5a-2a56e5e0df8f",
+ "NhsNumber": "9000000005",
+ },
+ {
+ "CurrentGpOds": "H81109",
+ "ContentType": "image/bmp",
+ "FileLocation": "s3://test-arf-table/9000000009/71e0c54e-5cfc-4260-a538-09eab185a6ed",
+ "NhsNumber": "9000000009",
+ },
+]
+
+MB = 1024 * 1024
+MOCK_LG_LIST_OBJECTS_RESULT = [
+ {"Key": "9000000009/4ac21f7b-abd6-46c9-bf55-e7bafccba2ab", "Size": 1 * MB},
+ {"Key": "9000000009/95764e53-b5fd-47b8-8756-1c5604121444", "Size": 2 * MB},
+ {"Key": "9000000009/0ab18243-783a-4044-8146-b5b0996d8422", "Size": 3 * MB},
+ {"Key": "9000000001/5d5b3c28-e6c8-4d46-8ae3-a2b97321bcf8", "Size": 4 * MB},
+ {"Key": "9000000001/2543ba87-dcdb-4583-bae2-e83c4ba7af34", "Size": 5 * MB},
+]
+MOCK_ARF_LIST_OBJECTS_RESULT = [
+ {
+ "Key": "9000000005/beec3523-1428-4c5f-b718-6ef25a4db1b9",
+ "Size": 1 * MB,
+ },
+ {
+ "Key": "9000000005/d2cf885b-9e78-4c29-bf5a-2a56e5e0df8f",
+ "Size": 2 * MB,
+ },
+ {
+ "Key": "9000000009/71e0c54e-5cfc-4260-a538-09eab185a6ed",
+ "Size": 6 * MB,
+ },
+]
+TOTAL_FILE_SIZE_FOR_9000000001 = 4 + 5
+TOTAL_FILE_SIZE_FOR_9000000009 = 1 + 2 + 3 + 6
+TOTAL_FILE_SIZE_FOR_9000000005 = 1 + 2
+TOTAL_FILE_SIZE_FOR_H81109 = (
+ TOTAL_FILE_SIZE_FOR_9000000001 + TOTAL_FILE_SIZE_FOR_9000000009
+)
+TOTAL_FILE_SIZE_FOR_Y12345 = TOTAL_FILE_SIZE_FOR_9000000005
diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_logs_query_results.py b/lambdas/tests/unit/helpers/data/statistic/mock_logs_query_results.py
new file mode 100644
index 000000000..8e5049b52
--- /dev/null
+++ b/lambdas/tests/unit/helpers/data/statistic/mock_logs_query_results.py
@@ -0,0 +1,98 @@
+USER_ID_1 = "F4A6AF98-4800-4A8A-A6C0-8FE0AC4B994B"
+USER_ID_2 = "9E7F1235-3DF1-4822-AFFB-C4FCC88C2690"
+HASHED_USER_ID_1 = "3192b6cf7ef953cf1a1f0945a83b55ab2cb8bae95cac6548ae5412aaa4c67677"
+HASHED_USER_ID_2 = "a89d1cb4ac0776e45131c65a69e8b1a48026e9b497c94409e480588418a016e4"
+
+MOCK_UNIQUE_ACTIVE_USER_IDS = [
+ {
+ "ods_code": "Y12345",
+ "user_id": USER_ID_1,
+ },
+ {
+ "ods_code": "H81109",
+ "user_id": USER_ID_1,
+ },
+ {
+ "ods_code": "H81109",
+ "user_id": USER_ID_2,
+ },
+]
+
+
+MOCK_LG_VIEWED = [
+ {
+ "ods_code": "Y12345",
+ "daily_count_viewed": "20",
+ },
+ {
+ "ods_code": "H81109",
+ "daily_count_viewed": "40",
+ },
+]
+
+MOCK_LG_DOWNLOADED = [
+ {
+ "ods_code": "Y12345",
+ "daily_count_downloaded": "10",
+ },
+ {
+ "ods_code": "H81109",
+ "daily_count_downloaded": "20",
+ },
+]
+
+MOCK_LG_DELETED = [
+ {
+ "ods_code": "Y12345",
+ "daily_count_deleted": "1",
+ },
+ {
+ "ods_code": "H81109",
+ "daily_count_deleted": "2",
+ },
+]
+
+MOCK_LG_STORED = [
+ {
+ "ods_code": "Y12345",
+ "daily_count_stored": "2",
+ },
+ {
+ "ods_code": "H81109",
+ "daily_count_stored": "4",
+ },
+]
+
+MOCK_RESPONSE_QUERY_IN_PROGRESS = {"status": "Running"}
+
+MOCK_RESPONSE_QUERY_FAILED = {"status": "Failed"}
+
+MOCK_RESPONSE_QUERY_COMPLETE = {
+ "results": [
+ [
+ {"field": "ods_code", "value": "Y12345"},
+ {"field": "daily_count_viewed", "value": "20"},
+ ],
+ [
+ {"field": "ods_code", "value": "H81109"},
+ {"field": "daily_count_viewed", "value": "40"},
+ ],
+ ],
+ "statistics": {
+ "recordsMatched": 123.0,
+ "recordsScanned": 123.0,
+ "bytesScanned": 123.0,
+ },
+ "status": "Complete",
+}
+
+EXPECTED_QUERY_RESULT = [
+ {
+ "ods_code": "Y12345",
+ "daily_count_viewed": "20",
+ },
+ {
+ "ods_code": "H81109",
+ "daily_count_viewed": "40",
+ },
+]
diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_statistic_data.py b/lambdas/tests/unit/helpers/data/statistic/mock_statistic_data.py
new file mode 100644
index 000000000..eaa91162a
--- /dev/null
+++ b/lambdas/tests/unit/helpers/data/statistic/mock_statistic_data.py
@@ -0,0 +1,298 @@
+from decimal import Decimal
+
+import polars as pl
+from models.statistics import ApplicationData, OrganisationData, RecordStoreData
+
+MOCK_RECORD_STORE_DATA_1 = RecordStoreData(
+ statistic_id="e02ec4db-8a7d-4f84-a4b3-875a526b37d4",
+ date="20240510",
+ ods_code="Z56789",
+ total_number_of_records=18,
+ number_of_document_types=1,
+ total_size_of_records_in_megabytes=Decimal("1.75"),
+ average_size_of_documents_per_patient_in_megabytes=Decimal("1.5"),
+)
+
+MOCK_RECORD_STORE_DATA_2 = RecordStoreData(
+ statistic_id="974b1ca0-8e5e-4d12-9673-93050f0fee71",
+ date="20240510",
+ ods_code="Y12345",
+ total_number_of_records=25,
+ number_of_document_types=1,
+ total_size_of_records_in_megabytes=Decimal("1.23"),
+ average_size_of_documents_per_patient_in_megabytes=Decimal("0.5"),
+)
+
+MOCK_RECORD_STORE_DATA_3 = RecordStoreData(
+ statistic_id="c2841ca0-8e5e-4d12-9673-93050f0fee71",
+ date="20240511",
+ ods_code="Y12345",
+ total_number_of_records=20,
+ number_of_document_types=2,
+ total_size_of_records_in_megabytes=Decimal("2.34"),
+ average_size_of_documents_per_patient_in_megabytes=Decimal("0.6"),
+)
+
+EXPECTED_SUMMARY_RECORD_STORE_DATA = pl.DataFrame(
+ [
+ {
+ "ods_code": "Z56789",
+ "total_number_of_records": 18,
+ "number_of_document_types": 1,
+ "total_size_of_records_in_megabytes": 1.75,
+ "average_size_of_documents_per_patient_in_megabytes": 1.5,
+ },
+ {
+ "ods_code": "Y12345",
+ "total_number_of_records": 20,
+ "number_of_document_types": 2,
+ "total_size_of_records_in_megabytes": 2.34,
+ "average_size_of_documents_per_patient_in_megabytes": 0.6,
+ },
+ ]
+)
+
+SERIALISED_RECORD_STORE_DATA = [
+ {
+ "TotalSizeOfRecordsInMegabytes": Decimal("1.75"),
+ "AverageSizeOfDocumentsPerPatientInMegabytes": Decimal("1.5"),
+ "Date": "20240510",
+ "TotalNumberOfRecords": 18,
+ "NumberOfDocumentTypes": 1,
+ "OdsCode": "Z56789",
+ "StatisticID": "RecordStoreData#e02ec4db-8a7d-4f84-a4b3-875a526b37d4",
+ },
+ {
+ "TotalSizeOfRecordsInMegabytes": Decimal("1.23"),
+ "AverageSizeOfDocumentsPerPatientInMegabytes": Decimal("0.5"),
+ "Date": "20240510",
+ "TotalNumberOfRecords": 25,
+ "NumberOfDocumentTypes": 1,
+ "OdsCode": "Y12345",
+ "StatisticID": "RecordStoreData#974b1ca0-8e5e-4d12-9673-93050f0fee71",
+ },
+ {
+ "TotalSizeOfRecordsInMegabytes": Decimal("2.34"),
+ "AverageSizeOfDocumentsPerPatientInMegabytes": Decimal("0.6"),
+ "Date": "20240511",
+ "TotalNumberOfRecords": 20,
+ "NumberOfDocumentTypes": 2,
+ "OdsCode": "Y12345",
+ "StatisticID": "RecordStoreData#c2841ca0-8e5e-4d12-9673-93050f0fee71",
+ },
+]
+
+MOCK_ORGANISATION_DATA_1 = OrganisationData(
+ statistic_id="5acda4bf-8b93-4ba0-8410-789aac4fcbae",
+ date="20240510",
+ ods_code="Z56789",
+ number_of_patients=4,
+ average_records_per_patient=Decimal("4.5"),
+ daily_count_stored=0,
+ daily_count_viewed=35,
+ daily_count_downloaded=4,
+ daily_count_deleted=1,
+)
+MOCK_ORGANISATION_DATA_2 = OrganisationData(
+ statistic_id="9ee2c3d1-97b9-4c34-b75c-83e7d1b442f4",
+ date="20240510",
+ ods_code="Y12345",
+ number_of_patients=9,
+ average_records_per_patient=Decimal("2.78"),
+ daily_count_stored=0,
+ daily_count_viewed=15,
+ daily_count_downloaded=1,
+ daily_count_deleted=1,
+)
+MOCK_ORGANISATION_DATA_3 = OrganisationData(
+ statistic_id="3f54cfe3-6c84-4bb2-b5b4-b786aa03b9c7",
+ date="20240511",
+ ods_code="Y12345",
+ number_of_patients=10,
+ average_records_per_patient=Decimal("3.51"),
+ daily_count_stored=2,
+ daily_count_viewed=30,
+ daily_count_downloaded=5,
+ daily_count_deleted=1,
+)
+
+EXPECTED_SUMMARY_ORGANISATION_DATA = pl.DataFrame(
+ [
+ {
+ "ods_code": "Z56789",
+ "weekly_count_stored": 0,
+ "weekly_count_viewed": 35,
+ "weekly_count_downloaded": 4,
+ "weekly_count_deleted": 1,
+ "average_records_per_patient": 4.5,
+ "number_of_patients": 4,
+ },
+ {
+ "ods_code": "Y12345",
+ "weekly_count_stored": 0 + 2,
+ "weekly_count_viewed": 15 + 30,
+ "weekly_count_downloaded": 1 + 5,
+ "weekly_count_deleted": 1 + 1,
+ "average_records_per_patient": (3.51 + 2.78) / 2,
+ "number_of_patients": 10,
+ },
+ ]
+)
+
+
+SERIALISED_ORGANISATION_DATA = [
+ {
+ "Date": "20240510",
+ "OdsCode": "Z56789",
+ "NumberOfPatients": 4,
+ "AverageRecordsPerPatient": Decimal("4.5"),
+ "DailyCountStored": 0,
+ "DailyCountViewed": 35,
+ "DailyCountDownloaded": 4,
+ "DailyCountDeleted": 1,
+ "StatisticID": "OrganisationData#5acda4bf-8b93-4ba0-8410-789aac4fcbae",
+ },
+ {
+ "Date": "20240510",
+ "OdsCode": "Y12345",
+ "NumberOfPatients": 9,
+ "AverageRecordsPerPatient": Decimal("2.78"),
+ "DailyCountStored": 0,
+ "DailyCountViewed": 15,
+ "DailyCountDownloaded": 1,
+ "DailyCountDeleted": 1,
+ "StatisticID": "OrganisationData#9ee2c3d1-97b9-4c34-b75c-83e7d1b442f4",
+ },
+ {
+ "Date": "20240511",
+ "OdsCode": "Y12345",
+ "NumberOfPatients": 10,
+ "AverageRecordsPerPatient": Decimal("3.51"),
+ "DailyCountStored": 2,
+ "DailyCountViewed": 30,
+ "DailyCountDownloaded": 5,
+ "DailyCountDeleted": 1,
+ "StatisticID": "OrganisationData#3f54cfe3-6c84-4bb2-b5b4-b786aa03b9c7",
+ },
+]
+
+MOCK_APPLICATION_DATA_1 = ApplicationData(
+ statistic_id="65ee0add-41ca-4b71-a6d2-63e309bed920",
+ date="20240510",
+ ods_code="Z56789",
+ active_user_ids_hashed=[
+ "zf1af742e351ce63d8ed275d4bec8d8f",
+ ],
+)
+MOCK_APPLICATION_DATA_2 = ApplicationData(
+ statistic_id="12d92f26-47c3-452c-923b-819cfcc27c79",
+ date="20240510",
+ ods_code="Y12345",
+ active_user_ids_hashed=[
+ "a873620d0b476b13ee571a28cc315870",
+ "ba81803adac3c816b6cbaf67bf33022a",
+ ],
+)
+MOCK_APPLICATION_DATA_3 = ApplicationData(
+ statistic_id="d495959f-93dc-4f05-a869-43d8711ca120",
+ date="20240511",
+ ods_code="Y12345",
+ active_user_ids_hashed=[
+ "a873620d0b476b13ee571a28cc315870",
+ "cf1af742e351ce63d8ed275d4bec8d8f",
+ ],
+)
+
+EXPECTED_SUMMARY_APPLICATION_DATA = pl.DataFrame(
+ [
+ {"ods_code": "Z56789", "active_users_count": 1},
+ {"ods_code": "Y12345", "active_users_count": 3},
+ ],
+)
+
+SERIALISED_APPLICATION_DATA = [
+ {
+ "Date": "20240510",
+ "OdsCode": "Z56789",
+ "StatisticID": "ApplicationData#65ee0add-41ca-4b71-a6d2-63e309bed920",
+ "ActiveUserIdsHashed": [
+ "zf1af742e351ce63d8ed275d4bec8d8f",
+ ],
+ },
+ {
+ "Date": "20240510",
+ "OdsCode": "Y12345",
+ "StatisticID": "ApplicationData#12d92f26-47c3-452c-923b-819cfcc27c79",
+ "ActiveUserIdsHashed": [
+ "a873620d0b476b13ee571a28cc315870",
+ "ba81803adac3c816b6cbaf67bf33022a",
+ ],
+ },
+ {
+ "Date": "20240511",
+ "OdsCode": "Y12345",
+ "StatisticID": "ApplicationData#d495959f-93dc-4f05-a869-43d8711ca120",
+ "ActiveUserIdsHashed": [
+ "a873620d0b476b13ee571a28cc315870",
+ "cf1af742e351ce63d8ed275d4bec8d8f",
+ ],
+ },
+]
+
+ALL_MOCKED_STATISTIC_DATA = (
+ [MOCK_RECORD_STORE_DATA_1, MOCK_RECORD_STORE_DATA_2, MOCK_RECORD_STORE_DATA_3],
+ [MOCK_ORGANISATION_DATA_1, MOCK_ORGANISATION_DATA_2, MOCK_ORGANISATION_DATA_3],
+ [MOCK_APPLICATION_DATA_1, MOCK_APPLICATION_DATA_2, MOCK_APPLICATION_DATA_3],
+)
+
+ALL_SUMMARY_DATA = [
+ EXPECTED_SUMMARY_RECORD_STORE_DATA,
+ EXPECTED_SUMMARY_ORGANISATION_DATA,
+ EXPECTED_SUMMARY_APPLICATION_DATA,
+]
+
+MOCK_DYNAMODB_ITEMS = (
+ SERIALISED_APPLICATION_DATA
+ + SERIALISED_ORGANISATION_DATA
+ + SERIALISED_RECORD_STORE_DATA
+)
+
+MOCK_DYNAMODB_QUERY_RESPONSE = [
+ {"Items": [item for item in MOCK_DYNAMODB_ITEMS if item["Date"] == "20240510"]},
+ {"Items": [item for item in MOCK_DYNAMODB_ITEMS if item["Date"] == "20240511"]},
+]
+
+EXPECTED_WEEKLY_SUMMARY = pl.DataFrame(
+ [
+ {
+ "Date": "20240505-20240511",
+ "ODS code": "Z56789",
+ "Active users count": 1,
+ "Average records per patient": 4.5,
+ "Average size of documents per patient in megabytes": 1.5,
+ "Number of document types": 1,
+ "Number of patients": 4,
+ "Total number of records": 18,
+ "Total size of records in megabytes": 1.75,
+ "Weekly count deleted": 1,
+ "Weekly count downloaded": 4,
+ "Weekly count stored": 0,
+ "Weekly count viewed": 35,
+ },
+ {
+ "Date": "20240505-20240511",
+ "ODS code": "Y12345",
+ "Active users count": 3,
+ "Average records per patient": (2.78 + 3.51) / 2,
+ "Average size of documents per patient in megabytes": 0.6,
+ "Number of document types": 2,
+ "Number of patients": 10,
+ "Total number of records": 20,
+ "Total size of records in megabytes": 2.34,
+ "Weekly count deleted": 1 + 1,
+ "Weekly count downloaded": 1 + 5,
+ "Weekly count stored": 0 + 2,
+ "Weekly count viewed": 15 + 30,
+ },
+ ]
+)
diff --git a/lambdas/tests/unit/models/test_statistics_models.py b/lambdas/tests/unit/models/test_statistics_models.py
new file mode 100644
index 000000000..b6a744448
--- /dev/null
+++ b/lambdas/tests/unit/models/test_statistics_models.py
@@ -0,0 +1,63 @@
+import pydantic
+import pytest
+from models.statistics import ApplicationData, RecordStoreData, load_from_dynamodb_items
+from tests.unit.helpers.data.statistic.mock_statistic_data import (
+ MOCK_APPLICATION_DATA_1,
+ MOCK_APPLICATION_DATA_2,
+ MOCK_APPLICATION_DATA_3,
+ MOCK_DYNAMODB_ITEMS,
+ MOCK_ORGANISATION_DATA_1,
+ MOCK_ORGANISATION_DATA_2,
+ MOCK_ORGANISATION_DATA_3,
+ MOCK_RECORD_STORE_DATA_1,
+ MOCK_RECORD_STORE_DATA_2,
+ MOCK_RECORD_STORE_DATA_3,
+ SERIALISED_RECORD_STORE_DATA,
+)
+
+
+def test_serialise_and_deserialise_record_store_data(mocker):
+ mocker.patch("uuid.uuid4", return_value="test_uuid")
+
+ test_data = MOCK_RECORD_STORE_DATA_1
+
+ output = test_data.model_dump(by_alias=True)
+ expected = SERIALISED_RECORD_STORE_DATA[0]
+ assert output == expected
+
+ load_from_deserialised = RecordStoreData.model_validate(output)
+ assert test_data == load_from_deserialised
+
+
+def test_empty_ods_code_will_be_filled_with_an_empty_value():
+ data = RecordStoreData(date="20240516", ods_code="")
+ assert data.ods_code == "NO_ODS_CODE"
+
+
+def test_validation_error_raised_when_try_to_deserialise_to_wrong_type():
+ test_data = SERIALISED_RECORD_STORE_DATA[0]
+
+ with pytest.raises(pydantic.ValidationError) as e:
+ ApplicationData.model_validate(test_data)
+
+ assert "StatisticID must be in the form of `ApplicationData#uuid" in str(e.value)
+
+
+def test_load_from_dynamodb_items():
+ deserialised_data = load_from_dynamodb_items(MOCK_DYNAMODB_ITEMS)
+
+ assert deserialised_data.record_store_data == [
+ MOCK_RECORD_STORE_DATA_1,
+ MOCK_RECORD_STORE_DATA_2,
+ MOCK_RECORD_STORE_DATA_3,
+ ]
+ assert deserialised_data.organisation_data == [
+ MOCK_ORGANISATION_DATA_1,
+ MOCK_ORGANISATION_DATA_2,
+ MOCK_ORGANISATION_DATA_3,
+ ]
+ assert deserialised_data.application_data == [
+ MOCK_APPLICATION_DATA_1,
+ MOCK_APPLICATION_DATA_2,
+ MOCK_APPLICATION_DATA_3,
+ ]
diff --git a/lambdas/tests/unit/services/base/test_cloudwatch_logs_query_service.py b/lambdas/tests/unit/services/base/test_cloudwatch_logs_query_service.py
new file mode 100644
index 000000000..ceed28490
--- /dev/null
+++ b/lambdas/tests/unit/services/base/test_cloudwatch_logs_query_service.py
@@ -0,0 +1,133 @@
+import pytest
+from services.base.cloudwatch_service import CloudwatchService
+from tests.unit.conftest import WORKSPACE
+from tests.unit.helpers.data.statistic.mock_logs_query_results import (
+ EXPECTED_QUERY_RESULT,
+ MOCK_RESPONSE_QUERY_COMPLETE,
+ MOCK_RESPONSE_QUERY_FAILED,
+ MOCK_RESPONSE_QUERY_IN_PROGRESS,
+)
+from utils.cloudwatch_logs_query import CloudwatchLogsQueryParams
+from utils.exceptions import LogsQueryException
+
+MOCK_QUERY_ID = "mock_query_id"
+MOCK_LAMBDA_NAME = "mock-lambda"
+MOCK_QUERY_STRING = """
+ fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code
+ | filter Message = 'User has viewed Lloyd George records'
+ | stats count() AS daily_count_viewed BY ods_code
+ """
+MOCK_QUERY_PARAMS = CloudwatchLogsQueryParams(MOCK_LAMBDA_NAME, MOCK_QUERY_STRING)
+MOCK_START_TIME = 1717667304
+MOCK_END_TIME = 171777304
+
+
+@pytest.fixture
+def mock_service(set_env, mock_logs_client, patch_sleep):
+ service = CloudwatchService()
+ yield service
+
+
+@pytest.fixture
+def patch_sleep(mocker):
+ mocker.patch("time.sleep")
+
+
+@pytest.fixture
+def mock_logs_client(mocker):
+ mock_instance = mocker.patch("boto3.client").return_value
+ yield mock_instance
+
+
+def test_query_logs(mock_logs_client, mock_service):
+ mock_logs_client.start_query.return_value = {"queryId": MOCK_QUERY_ID}
+ mock_logs_client.get_query_results.side_effect = [
+ MOCK_RESPONSE_QUERY_IN_PROGRESS,
+ MOCK_RESPONSE_QUERY_IN_PROGRESS,
+ MOCK_RESPONSE_QUERY_COMPLETE,
+ ]
+ expected = EXPECTED_QUERY_RESULT
+ expected_start_query_call = {
+ "logGroupName": f"/aws/lambda/{WORKSPACE}_{MOCK_QUERY_PARAMS.lambda_name}",
+ "startTime": MOCK_START_TIME,
+ "endTime": MOCK_END_TIME,
+ "queryString": MOCK_QUERY_PARAMS.query_string,
+ }
+
+ actual = mock_service.query_logs(
+ query_params=MOCK_QUERY_PARAMS,
+ start_time=MOCK_START_TIME,
+ end_time=MOCK_END_TIME,
+ )
+
+ assert actual == expected
+
+ mock_logs_client.start_query.assert_called_with(**expected_start_query_call)
+ mock_logs_client.get_query_results.assert_called_with(queryId=MOCK_QUERY_ID)
+
+
+def test_poll_query_result_poll_result_until_complete(mock_logs_client, mock_service):
+ mock_logs_client.get_query_results.side_effect = [
+ MOCK_RESPONSE_QUERY_IN_PROGRESS,
+ MOCK_RESPONSE_QUERY_IN_PROGRESS,
+ MOCK_RESPONSE_QUERY_COMPLETE,
+ ]
+
+ actual = mock_service.poll_query_result(MOCK_QUERY_ID)
+ expected = MOCK_RESPONSE_QUERY_COMPLETE["results"]
+
+ assert actual == expected
+
+ mock_logs_client.get_query_results.assert_called_with(queryId=MOCK_QUERY_ID)
+ assert mock_logs_client.get_query_results.call_count == 3
+
+
+def test_poll_query_result_raise_error_when_exceed_max_retries(
+ mock_logs_client, mock_service
+):
+ mock_logs_client.get_query_results.return_value = MOCK_RESPONSE_QUERY_IN_PROGRESS
+
+ with pytest.raises(LogsQueryException):
+ mock_service.poll_query_result(query_id=MOCK_QUERY_ID, max_retries=20)
+ assert mock_logs_client.get_query_results.call_count == 20
+
+
+def test_poll_query_result_raise_error_when_query_failed(
+ mock_logs_client, mock_service
+):
+ mock_logs_client.get_query_results.side_effect = [
+ MOCK_RESPONSE_QUERY_IN_PROGRESS,
+ MOCK_RESPONSE_QUERY_IN_PROGRESS,
+ MOCK_RESPONSE_QUERY_FAILED,
+ ]
+
+ with pytest.raises(LogsQueryException):
+ mock_service.poll_query_result(MOCK_QUERY_ID)
+ assert mock_logs_client.get_query_results.call_count == 3
+
+
+def test_regroup_raw_query_result(mock_service):
+ raw_query_result = [
+ [
+ {"field": "ods_code", "value": "Y12345"},
+ {"field": "daily_count_viewed", "value": "20"},
+ ],
+ [
+ {"field": "ods_code", "value": "H81109"},
+ {"field": "daily_count_viewed", "value": "40"},
+ ],
+ ]
+ expected = [
+ {
+ "ods_code": "Y12345",
+ "daily_count_viewed": "20",
+ },
+ {
+ "ods_code": "H81109",
+ "daily_count_viewed": "40",
+ },
+ ]
+
+ actual = mock_service.regroup_raw_query_result(raw_query_result)
+
+ assert actual == expected
diff --git a/lambdas/tests/unit/services/base/test_dynamo_service.py b/lambdas/tests/unit/services/base/test_dynamo_service.py
index 1cca255cc..3ac940e64 100755
--- a/lambdas/tests/unit/services/base/test_dynamo_service.py
+++ b/lambdas/tests/unit/services/base/test_dynamo_service.py
@@ -1,3 +1,6 @@
+from typing import Optional
+from unittest.mock import call
+
import pytest
from boto3.dynamodb.conditions import Attr, Key
from botocore.exceptions import ClientError
@@ -6,6 +9,13 @@
from services.base.dynamo_service import DynamoDBService
from tests.unit.conftest import MOCK_TABLE_NAME, TEST_NHS_NUMBER
from tests.unit.helpers.data.dynamo_responses import MOCK_SEARCH_RESPONSE
+from tests.unit.helpers.data.dynamo_scan_response import (
+ EXPECTED_ITEMS_FOR_PAGINATED_RESULTS,
+ MOCK_PAGINATED_RESPONSE_1,
+ MOCK_PAGINATED_RESPONSE_2,
+ MOCK_PAGINATED_RESPONSE_3,
+ MOCK_RESPONSE,
+)
from utils.dynamo_query_filter_builder import DynamoQueryFilterBuilder
from utils.exceptions import DynamoServiceException
@@ -32,6 +42,13 @@ def mock_table(mocker, mock_service):
yield mocker.patch.object(mock_service, "get_table")
+@pytest.fixture
+def mock_scan_method(mock_table):
+ table_instance = mock_table.return_value
+ scan_method = table_instance.scan
+ yield scan_method
+
+
@pytest.fixture
def mock_filter_expression():
filter_builder = DynamoQueryFilterBuilder()
@@ -163,13 +180,13 @@ def test_query_with_requested_fields_client_error_raises_exception(
assert expected_response == actual_response.value
-def test_simple_query_is_called_with_correct_parameters(mock_service, mock_table):
+def test_query_all_fields_is_called_with_correct_parameters(mock_service, mock_table):
mock_table.return_value.query.return_value = {
"Items": [{"id": "fake_test_item"}],
"Counts": 1,
}
- mock_service.simple_query(MOCK_TABLE_NAME, "test_key_condition_expression")
+ mock_service.query_all_fields(MOCK_TABLE_NAME, "test_key_condition_expression")
mock_table.assert_called_with(MOCK_TABLE_NAME)
mock_table.return_value.query.assert_called_once_with(
@@ -177,11 +194,13 @@ def test_simple_query_is_called_with_correct_parameters(mock_service, mock_table
)
-def test_simple_query_raises_exception_when_results_are_empty(mock_service, mock_table):
+def test_query_all_fields_raises_exception_when_results_are_empty(
+ mock_service, mock_table
+):
mock_table.return_value.query.return_value = []
with pytest.raises(DynamoServiceException):
- mock_service.simple_query(MOCK_TABLE_NAME, "test_key_condition_expression")
+ mock_service.query_all_fields(MOCK_TABLE_NAME, "test_key_condition_expression")
mock_table.assert_called_with(MOCK_TABLE_NAME)
mock_table.return_value.query.assert_called_once_with(
@@ -189,12 +208,12 @@ def test_simple_query_raises_exception_when_results_are_empty(mock_service, mock
)
-def test_simple_query_client_error_raises_exception(mock_service, mock_table):
+def test_query_all_fields_client_error_raises_exception(mock_service, mock_table):
expected_response = MOCK_CLIENT_ERROR
mock_table.return_value.query.side_effect = MOCK_CLIENT_ERROR
with pytest.raises(ClientError) as actual_response:
- mock_service.simple_query(MOCK_TABLE_NAME, "test_key_condition_expression")
+ mock_service.query_all_fields(MOCK_TABLE_NAME, "test_key_condition_expression")
assert expected_response == actual_response.value
@@ -346,6 +365,85 @@ def test_scan_table_client_error_raises_exception(mock_service, mock_table):
assert expected_response == actual_response.value
+def test_scan_whole_table_return_items_in_response(
+ mock_service, mock_scan_method, mock_filter_expression
+):
+ mock_project_expression = "mock_project_expression"
+ mock_scan_method.return_value = MOCK_RESPONSE
+
+ expected = MOCK_RESPONSE["Items"]
+ actual = mock_service.scan_whole_table(
+ table_name=MOCK_TABLE_NAME,
+ project_expression=mock_project_expression,
+ filter_expression=mock_filter_expression,
+ )
+
+ assert expected == actual
+
+ mock_service.get_table.assert_called_with(MOCK_TABLE_NAME)
+ mock_scan_method.assert_called_with(
+ ProjectionExpression=mock_project_expression,
+ FilterExpression=mock_filter_expression,
+ )
+
+
+def test_scan_whole_table_handles_pagination(
+ mock_service, mock_scan_method, mock_filter_expression
+):
+ def mock_scan_implementation(
+ ExclusiveStartKey: Optional[dict[str, str]] = None, **_kwargs
+ ):
+ if not ExclusiveStartKey:
+ return MOCK_PAGINATED_RESPONSE_1
+ elif ExclusiveStartKey.get("ID") == "id_token_for_page_2":
+ return MOCK_PAGINATED_RESPONSE_2
+ elif ExclusiveStartKey.get("ID") == "id_token_for_page_3":
+ return MOCK_PAGINATED_RESPONSE_3
+
+ mock_project_expression = "mock_project_expression"
+ mock_scan_method.side_effect = mock_scan_implementation
+
+ expected_result = EXPECTED_ITEMS_FOR_PAGINATED_RESULTS
+ expected_calls = [
+ call(
+ ProjectionExpression=mock_project_expression,
+ FilterExpression=mock_filter_expression,
+ ),
+ call(
+ ProjectionExpression=mock_project_expression,
+ FilterExpression=mock_filter_expression,
+ ExclusiveStartKey={"ID": "id_token_for_page_2"},
+ ),
+ call(
+ ProjectionExpression=mock_project_expression,
+ FilterExpression=mock_filter_expression,
+ ExclusiveStartKey={"ID": "id_token_for_page_3"},
+ ),
+ ]
+
+ actual = mock_service.scan_whole_table(
+ table_name=MOCK_TABLE_NAME,
+ project_expression=mock_project_expression,
+ filter_expression=mock_filter_expression,
+ )
+
+ assert expected_result == actual
+
+ mock_service.get_table.assert_called_with(MOCK_TABLE_NAME)
+ mock_scan_method.assert_has_calls(expected_calls)
+
+
+def test_scan_whole_table_omit_expression_arguments_if_not_given(
+ mock_service, mock_scan_method
+):
+ mock_service.scan_whole_table(
+ table_name=MOCK_TABLE_NAME,
+ )
+
+ mock_service.get_table.assert_called_with(MOCK_TABLE_NAME)
+ mock_scan_method.assert_called_with()
+
+
def test_get_table_when_table_exists_then_table_is_returned_successfully(
mock_service, mock_dynamo_service
):
diff --git a/lambdas/tests/unit/services/base/test_s3_service.py b/lambdas/tests/unit/services/base/test_s3_service.py
index 63a342657..9b4d3fb05 100755
--- a/lambdas/tests/unit/services/base/test_s3_service.py
+++ b/lambdas/tests/unit/services/base/test_s3_service.py
@@ -8,8 +8,13 @@
TEST_NHS_NUMBER,
TEST_OBJECT_KEY,
)
-from tests.unit.helpers.data.s3_responses import MOCK_PRESIGNED_URL_RESPONSE
+from tests.unit.helpers.data.s3_responses import (
+ MOCK_LIST_OBJECTS_PAGINATED_RESPONSES,
+ MOCK_LIST_OBJECTS_RESPONSE,
+ MOCK_PRESIGNED_URL_RESPONSE,
+)
from utils.exceptions import TagNotFoundException
+from utils.utilities import flatten
TEST_DOWNLOAD_PATH = "test_path"
MOCK_EVENT_BODY = {
@@ -26,6 +31,7 @@ def mock_service(mocker, set_env):
mocker.patch("services.base.iam_service.IAMService")
service = S3Service(custom_aws_role="mock_arn_custom_role")
yield service
+ S3Service._instance = None
@pytest.fixture
@@ -40,6 +46,12 @@ def mock_custom_client(mocker, mock_service):
yield client
+@pytest.fixture
+def mock_list_objects_paginate(mock_client):
+ mock_paginator_method = mock_client.get_paginator.return_value.paginate
+ return mock_paginator_method
+
+
def test_create_upload_presigned_url(mock_service, mock_custom_client):
mock_custom_client.generate_presigned_post.return_value = (
MOCK_PRESIGNED_URL_RESPONSE
@@ -248,7 +260,6 @@ def test_file_exist_on_s3_raises_client_error_if_unexpected_response(
def test_s3_service_singleton_instance(mocker):
mocker.patch("boto3.client")
- S3Service._instance = None
instance_1 = S3Service()
instance_2 = S3Service()
@@ -257,8 +268,6 @@ def test_s3_service_singleton_instance(mocker):
def test_not_created_presigned_url_without_custom_client(mocker):
- S3Service._instance = None
-
mocker.patch("boto3.client")
mock_service = S3Service()
@@ -268,8 +277,6 @@ def test_not_created_presigned_url_without_custom_client(mocker):
def test_not_created_custom_client_without_client_role(mocker):
- S3Service._instance = None
-
mocker.patch("boto3.client")
iam_service = mocker.patch("services.base.iam_service.IAMService")
@@ -296,3 +303,45 @@ def test_created_custom_client_when_client_role_is_passed(mocker):
iam_service.assert_called()
assert mock_service.custom_client == custom_client_mock
iam_service_instance.assume_role.assert_called()
+
+
+def test_list_all_objects_return_a_list_of_file_details(
+ mock_service, mock_client, mock_list_objects_paginate
+):
+ mock_list_objects_paginate.return_value = [MOCK_LIST_OBJECTS_RESPONSE]
+ expected = MOCK_LIST_OBJECTS_RESPONSE["Contents"]
+
+ actual = mock_service.list_all_objects(MOCK_BUCKET)
+
+ assert actual == expected
+
+ mock_client.get_paginator.assert_called_with("list_objects_v2")
+ mock_list_objects_paginate.assert_called_with(Bucket=MOCK_BUCKET)
+
+
+def test_list_all_objects_handles_paginated_responses(
+ mock_service, mock_client, mock_list_objects_paginate
+):
+ mock_list_objects_paginate.return_value = MOCK_LIST_OBJECTS_PAGINATED_RESPONSES
+
+ expected = flatten(
+ [page["Contents"] for page in MOCK_LIST_OBJECTS_PAGINATED_RESPONSES]
+ )
+
+ actual = mock_service.list_all_objects(MOCK_BUCKET)
+
+ assert actual == expected
+
+
+def test_list_all_objects_raises_client_error_if_unexpected_response(
+ mock_service, mock_client, mock_list_objects_paginate
+):
+ mock_error = ClientError(
+ {"Error": {"Code": "500", "Message": "Internal Server Error"}},
+ "S3:ListObjectsV2",
+ )
+
+ mock_list_objects_paginate.side_effect = mock_error
+
+ with pytest.raises(ClientError):
+ mock_service.list_all_objects(MOCK_BUCKET)
diff --git a/lambdas/tests/unit/services/test_authoriser_service.py b/lambdas/tests/unit/services/test_authoriser_service.py
index 0f953ce08..0a0bbc864 100644
--- a/lambdas/tests/unit/services/test_authoriser_service.py
+++ b/lambdas/tests/unit/services/test_authoriser_service.py
@@ -44,7 +44,7 @@ def mock_dynamo_service(mocker):
}
],
}
- dynamo_service = mocker.patch.object(DynamoDBService, "simple_query")
+ dynamo_service = mocker.patch.object(DynamoDBService, "query_all_fields")
dynamo_service.return_value = valid_session_record
yield dynamo_service
@@ -142,7 +142,7 @@ def test_find_login_session_raises_auth_exception(
invalid_session_record = {
"Count": 1,
}
- dynamo_service = mocker.patch.object(DynamoDBService, "simple_query")
+ dynamo_service = mocker.patch.object(DynamoDBService, "query_all_fields")
dynamo_service.return_value = invalid_session_record
with pytest.raises(AuthorisationException):
diff --git a/lambdas/tests/unit/services/test_data_collection_service.py b/lambdas/tests/unit/services/test_data_collection_service.py
new file mode 100644
index 000000000..044d00b1a
--- /dev/null
+++ b/lambdas/tests/unit/services/test_data_collection_service.py
@@ -0,0 +1,434 @@
+from datetime import datetime
+from decimal import Decimal
+from random import shuffle
+from unittest.mock import call
+
+import pytest
+from freezegun import freeze_time
+from pytest_unordered import unordered
+from services.base.cloudwatch_service import CloudwatchService
+from services.base.dynamo_service import DynamoDBService
+from services.base.s3_service import S3Service
+from services.data_collection_service import DataCollectionService
+from tests.unit.conftest import (
+ MOCK_ARF_BUCKET,
+ MOCK_ARF_TABLE_NAME,
+ MOCK_LG_BUCKET,
+ MOCK_LG_TABLE_NAME,
+ MOCK_STATISTICS_TABLE,
+)
+from tests.unit.helpers.data.statistic.mock_collected_data import (
+ ALL_MOCK_DATA_AS_JSON_LIST,
+ MOCK_APPLICATION_DATA,
+ MOCK_ORGANISATION_DATA,
+ MOCK_RECORD_STORE_DATA,
+)
+from tests.unit.helpers.data.statistic.mock_dynamodb_and_s3_records import (
+ MOCK_ARF_LIST_OBJECTS_RESULT,
+ MOCK_ARF_SCAN_RESULT,
+ MOCK_LG_LIST_OBJECTS_RESULT,
+ MOCK_LG_SCAN_RESULT,
+ TOTAL_FILE_SIZE_FOR_H81109,
+ TOTAL_FILE_SIZE_FOR_Y12345,
+ build_mock_results,
+)
+from tests.unit.helpers.data.statistic.mock_logs_query_results import (
+ HASHED_USER_ID_1,
+ HASHED_USER_ID_2,
+ MOCK_LG_DELETED,
+ MOCK_LG_DOWNLOADED,
+ MOCK_LG_STORED,
+ MOCK_LG_VIEWED,
+ MOCK_UNIQUE_ACTIVE_USER_IDS,
+)
+from utils.cloudwatch_logs_query import (
+ CloudwatchLogsQueryParams,
+ LloydGeorgeRecordsDeleted,
+ LloydGeorgeRecordsDownloaded,
+ LloydGeorgeRecordsStored,
+ LloydGeorgeRecordsViewed,
+ UniqueActiveUserIds,
+)
+from utils.common_query_filters import UploadCompleted
+
+
+@pytest.fixture
+def mock_dynamo_service(mocker):
+ def mock_implementation(table_name, **_kwargs):
+ if table_name == MOCK_LG_TABLE_NAME:
+ return MOCK_LG_SCAN_RESULT
+ elif table_name == MOCK_ARF_TABLE_NAME:
+ return MOCK_ARF_SCAN_RESULT
+
+ patched_instance = mocker.patch(
+ "services.data_collection_service.DynamoDBService", spec=DynamoDBService
+ ).return_value
+ patched_method = patched_instance.scan_whole_table
+ patched_method.side_effect = mock_implementation
+
+ yield patched_instance
+
+
+@pytest.fixture
+def mock_s3_list_all_objects(mocker):
+ def mock_implementation(bucket_name, **_kwargs):
+ if bucket_name == MOCK_LG_BUCKET:
+ return MOCK_LG_LIST_OBJECTS_RESULT
+ elif bucket_name == MOCK_ARF_BUCKET:
+ return MOCK_ARF_LIST_OBJECTS_RESULT
+
+ patched_instance = mocker.patch(
+ "services.data_collection_service.S3Service", spec=S3Service
+ ).return_value
+ patched_method = patched_instance.list_all_objects
+ patched_method.side_effect = mock_implementation
+
+ yield patched_method
+
+
+@pytest.fixture
+def mock_query_logs(mocker):
+ def mock_implementation(query_params: CloudwatchLogsQueryParams, **_kwargs):
+ if query_params == LloydGeorgeRecordsViewed:
+ return MOCK_LG_VIEWED
+ elif query_params == LloydGeorgeRecordsDownloaded:
+ return MOCK_LG_DOWNLOADED
+ elif query_params == LloydGeorgeRecordsDeleted:
+ return MOCK_LG_DELETED
+ elif query_params == LloydGeorgeRecordsStored:
+ return MOCK_LG_STORED
+ elif query_params == UniqueActiveUserIds:
+ return MOCK_UNIQUE_ACTIVE_USER_IDS
+
+ patched_instance = mocker.patch(
+ "services.data_collection_service.CloudwatchService",
+ spec=CloudwatchService,
+ ).return_value
+ mocked_method = patched_instance.query_logs
+ mocked_method.side_effect = mock_implementation
+
+ yield mocked_method
+
+
+@pytest.fixture
+def mock_service(
+ set_env, mock_query_logs, mock_dynamo_service, mock_s3_list_all_objects
+):
+ service = DataCollectionService()
+ yield service
+
+
+@pytest.fixture
+def mock_uuid(mocker):
+ yield mocker.patch("uuid.uuid4", return_value="mock_uuid")
+
+
+@pytest.fixture
+def larger_mock_data():
+ dynamodb_1, s3_1 = build_mock_results("H81109", "9000000001", 135, 123)
+ dynamodb_2, s3_2 = build_mock_results("H81109", "9000000002", 246, 456)
+ dynamodb_3, s3_3 = build_mock_results("H81109", "9000000003", 369, 789)
+ dynamodb_4, s3_4 = build_mock_results("Y12345", "9000000004", 4812, 9876)
+ dynamodb_5, s3_5 = build_mock_results("Y12345", "9000000005", 5101, 5432)
+
+ mock_dynamo_scan_result = (
+ dynamodb_1 + dynamodb_2 + dynamodb_3 + dynamodb_4 + dynamodb_5
+ )
+ mock_s3_list_objects_result = s3_1 + s3_2 + s3_3 + s3_4 + s3_5
+ shuffle(mock_dynamo_scan_result)
+ shuffle(mock_s3_list_objects_result)
+
+ return mock_dynamo_scan_result, mock_s3_list_objects_result
+
+
+@freeze_time("2024-06-04T18:00:00Z")
+def test_datetime_correctly_configured_during_initialise(set_env):
+ service = DataCollectionService()
+
+ assert service.today_date == "20240604"
+ assert (
+ service.collection_start_time
+ == datetime.fromisoformat("2024-06-03T18:00:00Z").timestamp()
+ )
+ assert (
+ service.collection_end_time
+ == datetime.fromisoformat("2024-06-04T18:00:00Z").timestamp()
+ )
+
+
+def test_collect_all_data_and_write_to_dynamodb(mock_service, mocker):
+ mock_collected_data = ["testing1234"]
+ mock_service.collect_all_data = mocker.MagicMock(return_value=mock_collected_data)
+ mock_service.write_to_dynamodb_table = mocker.MagicMock()
+
+ mock_service.collect_all_data_and_write_to_dynamodb()
+
+ mock_service.collect_all_data.assert_called_once()
+ mock_service.write_to_dynamodb_table.assert_called_with(mock_collected_data)
+
+
+def test_collect_all_data(mock_service, mock_uuid):
+ actual = mock_service.collect_all_data()
+ expected = unordered(
+ MOCK_RECORD_STORE_DATA + MOCK_ORGANISATION_DATA + MOCK_APPLICATION_DATA
+ )
+
+ assert actual == expected
+
+
+def test_write_to_dynamodb_table(mock_dynamo_service, mock_service):
+ mock_data = MOCK_RECORD_STORE_DATA + MOCK_ORGANISATION_DATA + MOCK_APPLICATION_DATA
+ mock_service.write_to_dynamodb_table(mock_data)
+
+ mock_dynamo_service.batch_writing.assert_called_with(
+ table_name=MOCK_STATISTICS_TABLE, item_list=ALL_MOCK_DATA_AS_JSON_LIST
+ )
+
+
+def test_scan_dynamodb_tables(mock_dynamo_service, mock_service):
+ mock_service.scan_dynamodb_tables()
+
+ expected_project_expression = "CurrentGpOds,NhsNumber,FileLocation,ContentType"
+ expected_filter_expression = UploadCompleted
+
+ expected_calls = [
+ call(
+ table_name=MOCK_ARF_TABLE_NAME,
+ project_expression=expected_project_expression,
+ filter_expression=expected_filter_expression,
+ ),
+ call(
+ table_name=MOCK_LG_TABLE_NAME,
+ project_expression=expected_project_expression,
+ filter_expression=expected_filter_expression,
+ ),
+ ]
+ mock_dynamo_service.scan_whole_table.assert_has_calls(expected_calls)
+
+
+def test_get_all_s3_files_info(mock_s3_list_all_objects, mock_service):
+ mock_service.get_all_s3_files_info()
+
+ expected_calls = [
+ call(MOCK_ARF_BUCKET),
+ call(MOCK_LG_BUCKET),
+ ]
+
+ mock_s3_list_all_objects.assert_has_calls(expected_calls)
+
+
+def test_get_record_store_data(mock_service, mock_uuid):
+ mock_dynamo_scan_result = MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT
+ s3_list_objects_result = MOCK_ARF_LIST_OBJECTS_RESULT + MOCK_LG_LIST_OBJECTS_RESULT
+
+ actual = mock_service.get_record_store_data(
+ mock_dynamo_scan_result, s3_list_objects_result
+ )
+ expected = unordered(MOCK_RECORD_STORE_DATA)
+
+ assert actual == expected
+
+
+def test_get_organisation_data(mock_service, mock_uuid):
+ mock_dynamo_scan_result = MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT
+
+ actual = mock_service.get_organisation_data(mock_dynamo_scan_result)
+ expected = unordered(MOCK_ORGANISATION_DATA)
+
+ assert actual == expected
+
+
+def test_get_application_data(mock_service, mock_uuid):
+ actual = mock_service.get_application_data()
+ expected = unordered(MOCK_APPLICATION_DATA)
+
+ assert actual == expected
+
+
+def test_get_active_user_list(set_env, mock_query_logs):
+ mock_query_logs.return_value = MOCK_UNIQUE_ACTIVE_USER_IDS
+ service = DataCollectionService()
+ expected = {
+ "H81109": [
+ HASHED_USER_ID_1,
+ HASHED_USER_ID_2,
+ ],
+ "Y12345": [HASHED_USER_ID_1],
+ }
+ actual = service.get_active_user_list()
+
+ assert actual == expected
+
+
+@freeze_time("2024-06-04T10:25:00")
+def test_get_cloud_watch_query_result(set_env, mock_query_logs):
+ mock_query_param = CloudwatchLogsQueryParams("mock", "test")
+ service = DataCollectionService()
+ expected_start_time = datetime.fromisoformat("2024-06-03T10:25:00").timestamp()
+ expected_end_time = datetime.fromisoformat("2024-06-04T10:25:00").timestamp()
+
+ service.get_cloud_watch_query_result(mock_query_param)
+
+ mock_query_logs.assert_called_with(
+ query_params=mock_query_param,
+ start_time=expected_start_time,
+ end_time=expected_end_time,
+ )
+
+
+def test_get_total_number_of_records(mock_service):
+ actual = mock_service.get_total_number_of_records(
+ MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT
+ )
+ expected = [
+ {"ods_code": "Y12345", "total_number_of_records": 2},
+ {"ods_code": "H81109", "total_number_of_records": 6},
+ ]
+
+ assert actual == expected
+
+
+def test_get_total_number_of_records_larger_mock_data(mock_service, larger_mock_data):
+ mock_dynamo_scan_result, _ = larger_mock_data
+
+ actual = mock_service.get_total_number_of_records(mock_dynamo_scan_result)
+ expected = unordered(
+ [
+ {"ods_code": "H81109", "total_number_of_records": 135 + 246 + 369},
+ {"ods_code": "Y12345", "total_number_of_records": 4812 + 5101},
+ ]
+ )
+
+ assert actual == expected
+
+
+def test_get_number_of_patients(mock_service):
+ actual = mock_service.get_number_of_patients(
+ MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT
+ )
+ expected = unordered(
+ [
+ {"ods_code": "Y12345", "number_of_patients": 1},
+ {"ods_code": "H81109", "number_of_patients": 2},
+ ]
+ )
+
+ assert actual == expected
+
+
+def test_get_metrics_for_total_and_average_file_sizes(mock_service):
+ mock_dynamo_scan_result = MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT
+ mock_s3_list_objects_result = (
+ MOCK_ARF_LIST_OBJECTS_RESULT + MOCK_LG_LIST_OBJECTS_RESULT
+ )
+
+ actual = mock_service.get_metrics_for_total_and_average_file_sizes(
+ mock_dynamo_scan_result, mock_s3_list_objects_result
+ )
+
+ expected = unordered(
+ [
+ {
+ "ods_code": "H81109",
+ "average_size_of_documents_per_patient_in_megabytes": TOTAL_FILE_SIZE_FOR_H81109
+ / 2,
+ "total_size_of_records_in_megabytes": TOTAL_FILE_SIZE_FOR_H81109,
+ },
+ {
+ "ods_code": "Y12345",
+ "average_size_of_documents_per_patient_in_megabytes": TOTAL_FILE_SIZE_FOR_Y12345,
+ "total_size_of_records_in_megabytes": TOTAL_FILE_SIZE_FOR_Y12345,
+ },
+ ]
+ )
+
+ assert actual == expected
+
+
+def test_get_metrics_for_total_and_average_file_sizes_larger_mock_data(
+ mock_service, larger_mock_data
+):
+ mock_dynamo_scan_result, mock_s3_list_objects_result = larger_mock_data
+ actual = mock_service.get_metrics_for_total_and_average_file_sizes(
+ mock_dynamo_scan_result, mock_s3_list_objects_result
+ )
+
+ expected = unordered(
+ [
+ {
+ "ods_code": "H81109",
+ "average_size_of_documents_per_patient_in_megabytes": (123 + 456 + 789)
+ / 3,
+ "total_size_of_records_in_megabytes": (123 + 456 + 789),
+ },
+ {
+ "ods_code": "Y12345",
+ "average_size_of_documents_per_patient_in_megabytes": (9876 + 5432) / 2,
+ "total_size_of_records_in_megabytes": (9876 + 5432),
+ },
+ ]
+ )
+
+ assert actual == expected
+
+
+def test_get_number_of_document_types(mock_service):
+ actual = mock_service.get_number_of_document_types(MOCK_ARF_SCAN_RESULT)
+ expected = unordered(
+ [
+ {"ods_code": "Y12345", "number_of_document_types": 2},
+ {"ods_code": "H81109", "number_of_document_types": 1},
+ ]
+ )
+
+ assert actual == expected
+
+ actual = mock_service.get_number_of_document_types(
+ MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT
+ )
+ expected = unordered(
+ [
+ {"ods_code": "Y12345", "number_of_document_types": 2},
+ {"ods_code": "H81109", "number_of_document_types": 2},
+ ]
+ )
+
+ assert actual == expected
+
+
+def test_get_average_number_of_file_per_patient(mock_service):
+ actual = mock_service.get_average_number_of_files_per_patient(
+ MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT
+ )
+ expected = unordered(
+ [
+ {"ods_code": "Y12345", "average_records_per_patient": 2},
+ {"ods_code": "H81109", "average_records_per_patient": 3},
+ ]
+ )
+
+ assert actual == expected
+
+
+def test_get_average_number_of_file_per_patient_larger_mock_data(
+ mock_service, larger_mock_data
+):
+ mock_dynamo_scan_result, _ = larger_mock_data
+
+ actual = mock_service.get_average_number_of_files_per_patient(
+ mock_dynamo_scan_result
+ )
+ expected = unordered(
+ [
+ {
+ "ods_code": "H81109",
+ "average_records_per_patient": Decimal(135 + 246 + 369) / 3,
+ },
+ {
+ "ods_code": "Y12345",
+ "average_records_per_patient": Decimal(4812 + 5101) / 2,
+ },
+ ]
+ )
+
+ assert actual == expected
diff --git a/lambdas/tests/unit/services/test_login_service.py b/lambdas/tests/unit/services/test_login_service.py
index c034b552c..9501eaefe 100644
--- a/lambdas/tests/unit/services/test_login_service.py
+++ b/lambdas/tests/unit/services/test_login_service.py
@@ -115,7 +115,7 @@ def test_exchange_token_respond_with_auth_token_and_repo_role(
dynamo_state_query_result = {"Count": 1, "Items": [{"id": "state"}]}
mocker.patch.object(
- DynamoDBService, "simple_query", return_value=dynamo_state_query_result
+ DynamoDBService, "query_all_fields", return_value=dynamo_state_query_result
)
mocker.patch.object(DynamoDBService, "delete_item")
@@ -163,7 +163,7 @@ def test_exchange_token_raises_login_error_when_given_state_is_not_in_state_tabl
mock_aws_infras, mock_oidc_service, mocker
):
mocker.patch.object(
- DynamoDBService, "simple_query", return_value={"Count": 0, "Items": []}
+ DynamoDBService, "query_all_fields", return_value={"Count": 0, "Items": []}
)
login_service = LoginService()
@@ -190,7 +190,7 @@ def keys(self):
dynamo_state_query_result = {"Count": 1, "Items": [{"id": "state"}]}
mocker.patch.object(
- DynamoDBService, "simple_query", return_value=dynamo_state_query_result
+ DynamoDBService, "query_all_fields", return_value=dynamo_state_query_result
)
mocker.patch.object(DynamoDBService, "delete_item")
@@ -213,7 +213,7 @@ def test_exchange_token_raises_error_when_encounter_boto3_error(
):
mocker.patch.object(
DynamoDBService,
- "simple_query",
+ "query_all_fields",
side_effect=ClientError(
{"Error": {"Code": "500", "Message": "mocked error"}}, "test"
),
diff --git a/lambdas/tests/unit/services/test_statistical_report_service.py b/lambdas/tests/unit/services/test_statistical_report_service.py
new file mode 100644
index 000000000..ff64f1430
--- /dev/null
+++ b/lambdas/tests/unit/services/test_statistical_report_service.py
@@ -0,0 +1,350 @@
+import tempfile
+from random import shuffle
+from unittest.mock import call
+
+import polars as pl
+import pytest
+from boto3.dynamodb.conditions import Key
+from freezegun import freeze_time
+from models.statistics import ApplicationData
+from polars.testing import assert_frame_equal
+from services.base.dynamo_service import DynamoDBService
+from services.base.s3_service import S3Service
+from services.statistical_report_service import StatisticalReportService
+from tests.unit.conftest import MOCK_STATISTICS_REPORT_BUCKET, MOCK_STATISTICS_TABLE
+from tests.unit.helpers.data.statistic.mock_data_build_utils import (
+ build_random_application_data,
+ build_random_organisation_data,
+ build_random_record_store_data,
+)
+from tests.unit.helpers.data.statistic.mock_statistic_data import (
+ ALL_MOCKED_STATISTIC_DATA,
+ EXPECTED_SUMMARY_APPLICATION_DATA,
+ EXPECTED_SUMMARY_ORGANISATION_DATA,
+ EXPECTED_SUMMARY_RECORD_STORE_DATA,
+ EXPECTED_WEEKLY_SUMMARY,
+ MOCK_APPLICATION_DATA_1,
+ MOCK_APPLICATION_DATA_2,
+ MOCK_APPLICATION_DATA_3,
+ MOCK_DYNAMODB_QUERY_RESPONSE,
+ MOCK_ORGANISATION_DATA_1,
+ MOCK_ORGANISATION_DATA_2,
+ MOCK_ORGANISATION_DATA_3,
+ MOCK_RECORD_STORE_DATA_1,
+ MOCK_RECORD_STORE_DATA_2,
+ MOCK_RECORD_STORE_DATA_3,
+)
+from utils.exceptions import StatisticDataNotFoundException
+
+
+@pytest.fixture
+def mock_service(set_env, mock_s3_service, mock_dynamodb_service):
+ return StatisticalReportService()
+
+
+@pytest.fixture
+def mock_s3_service(mocker):
+ patched_instance = mocker.patch(
+ "services.statistical_report_service.S3Service", spec=S3Service
+ ).return_value
+
+ yield patched_instance
+
+
+@pytest.fixture
+def mock_dynamodb_service(mocker):
+ patched_instance = mocker.patch(
+ "services.statistical_report_service.DynamoDBService", spec=DynamoDBService
+ ).return_value
+
+ yield patched_instance
+
+
+@pytest.fixture
+def mock_temp_folder(mocker):
+ mocker.patch.object(pl.DataFrame, "write_csv")
+ mocker.patch("shutil.rmtree")
+ temp_folder = tempfile.mkdtemp()
+ mocker.patch.object(tempfile, "mkdtemp", return_value=temp_folder)
+ yield temp_folder
+
+
+@freeze_time("2024-06-06T18:00:00Z")
+def test_datetime_correctly_configured_during_initialise(set_env):
+ service = StatisticalReportService()
+
+ assert service.dates_to_collect == [
+ "20240530",
+ "20240531",
+ "20240601",
+ "20240602",
+ "20240603",
+ "20240604",
+ "20240605",
+ ]
+ assert service.report_period == "20240530-20240605"
+
+
+@freeze_time("20240512T07:00:00Z")
+def test_make_weekly_summary(set_env, mocker):
+ data = ALL_MOCKED_STATISTIC_DATA
+ service = StatisticalReportService()
+ service.get_statistic_data = mocker.MagicMock(return_value=data)
+
+ actual = service.make_weekly_summary()
+ expected = EXPECTED_WEEKLY_SUMMARY
+
+ assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False)
+
+
+def test_get_statistic_data(mock_dynamodb_service, mock_service):
+ mock_service.dates_to_collect = ["20240510", "20240511"]
+ mock_dynamodb_service.query_all_fields.side_effect = MOCK_DYNAMODB_QUERY_RESPONSE
+
+ actual = mock_service.get_statistic_data()
+ expected = ALL_MOCKED_STATISTIC_DATA
+
+ assert actual == expected
+
+ expected_calls = [
+ call(
+ table_name=MOCK_STATISTICS_TABLE,
+ key_condition_expression=Key("Date").eq("20240510"),
+ ),
+ call(
+ table_name=MOCK_STATISTICS_TABLE,
+ key_condition_expression=Key("Date").eq("20240511"),
+ ),
+ ]
+
+ mock_dynamodb_service.query_all_fields.assert_has_calls(expected_calls)
+
+
+def test_get_statistic_data_raise_error_if_all_data_are_empty(
+ mock_dynamodb_service, mock_service
+):
+ mock_dynamodb_service.query_all_fields.return_value = {"Items": []}
+
+ with pytest.raises(StatisticDataNotFoundException):
+ mock_service.get_statistic_data()
+
+
+def test_summarise_record_store_data(mock_service):
+ actual = mock_service.summarise_record_store_data(
+ [MOCK_RECORD_STORE_DATA_1, MOCK_RECORD_STORE_DATA_2, MOCK_RECORD_STORE_DATA_3]
+ )
+
+ expected = EXPECTED_SUMMARY_RECORD_STORE_DATA
+
+ assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False)
+
+
+def test_summarise_record_store_data_larger_mock_data(mock_service):
+ mock_data_h81109 = build_random_record_store_data(
+ "H81109", ["20240601", "20240603", "20240604", "20240605", "20240607"]
+ )
+ mock_data_y12345 = build_random_record_store_data(
+ "Y12345", ["20240601", "20240602", "20240603", "20240606"]
+ )
+ mock_record_store_data = mock_data_h81109 + mock_data_y12345
+ shuffle(mock_record_store_data)
+
+ latest_record_in_h81109 = max(mock_data_h81109, key=lambda record: record.date)
+ latest_record_in_y12345 = max(mock_data_y12345, key=lambda record: record.date)
+ expected = pl.DataFrame([latest_record_in_h81109, latest_record_in_y12345]).drop(
+ "date", "statistic_id"
+ )
+
+ actual = mock_service.summarise_record_store_data(mock_record_store_data)
+
+ assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False)
+
+
+def test_summarise_record_store_data_can_handle_empty_input(mock_service):
+ empty_input = []
+ actual = mock_service.summarise_record_store_data(empty_input)
+
+ assert isinstance(actual, pl.DataFrame)
+ assert actual.is_empty()
+
+
+def test_summarise_organisation_data(mock_service):
+ actual = mock_service.summarise_organisation_data(
+ [MOCK_ORGANISATION_DATA_1, MOCK_ORGANISATION_DATA_2, MOCK_ORGANISATION_DATA_3]
+ )
+
+ expected = EXPECTED_SUMMARY_ORGANISATION_DATA
+
+ assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False)
+
+
+def test_summarise_organisation_data_larger_mock_data(mock_service):
+ mock_data_h81109 = build_random_organisation_data(
+ "H81109", ["20240603", "20240604", "20240605", "20240606", "20240607"]
+ )
+ mock_data_y12345 = build_random_organisation_data(
+ "Y12345", ["20240603", "20240604", "20240605", "20240606", "20240607"]
+ )
+ mock_input_data = {"H81109": mock_data_h81109, "Y12345": mock_data_y12345}
+
+ mock_organisation_data = mock_data_h81109 + mock_data_y12345
+ shuffle(mock_organisation_data)
+
+ actual = mock_service.summarise_organisation_data(mock_organisation_data)
+
+ for ods_code in mock_input_data.keys():
+ mock_data_of_ods_code = mock_input_data[ods_code]
+ row_in_actual_data = actual.filter(pl.col("ods_code") == ods_code)
+ assert_weekly_counts_match_sum_of_daily_counts(
+ mock_data_of_ods_code, row_in_actual_data
+ )
+ assert_average_record_per_patient_correct(
+ mock_data_of_ods_code, row_in_actual_data
+ )
+ assert_number_of_patient_correct(mock_data_of_ods_code, row_in_actual_data)
+
+
+def assert_weekly_counts_match_sum_of_daily_counts(mock_data, row_in_actual_data):
+ for count_type in ["viewed", "downloaded", "stored", "deleted"]:
+ expected_weekly_count = sum(
+ getattr(data, f"daily_count_{count_type}") for data in mock_data
+ )
+ actual_weekly_count = row_in_actual_data.item(0, f"weekly_count_{count_type}")
+
+ assert actual_weekly_count == expected_weekly_count
+
+
+def assert_average_record_per_patient_correct(mock_data, row_in_actual_data):
+ expected_average_patient_record = sum(
+ data.average_records_per_patient for data in mock_data
+ ) / len(mock_data)
+ actual_average_patient_record = row_in_actual_data.item(
+ 0, "average_records_per_patient"
+ )
+
+ assert actual_average_patient_record == float(expected_average_patient_record)
+
+
+def assert_number_of_patient_correct(mock_data, row_in_actual_data):
+ most_recent_record_in_mock_data = max(mock_data, key=lambda data: data.date)
+ expected_number_of_patients = most_recent_record_in_mock_data.number_of_patients
+ actual_number_of_patient = row_in_actual_data.item(0, "number_of_patients")
+
+ assert actual_number_of_patient == expected_number_of_patients
+
+
+def test_summarise_organisation_data_can_handle_empty_input(mock_service):
+ empty_input = []
+ actual = mock_service.summarise_organisation_data(empty_input)
+
+ assert isinstance(actual, pl.DataFrame)
+ assert actual.is_empty()
+
+
+def test_summarise_application_data(mock_service):
+ mock_data = [
+ MOCK_APPLICATION_DATA_1,
+ MOCK_APPLICATION_DATA_2,
+ MOCK_APPLICATION_DATA_3,
+ ]
+
+ expected = EXPECTED_SUMMARY_APPLICATION_DATA
+ actual = mock_service.summarise_application_data(mock_data)
+
+ assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False)
+
+
+def test_summarise_application_data_larger_mock_data(mock_service):
+ mock_data_h81109 = build_random_application_data(
+ "H81109", ["20240603", "20240604", "20240605", "20240606", "20240607"]
+ )
+ mock_data_y12345 = build_random_application_data(
+ "Y12345", ["20240603", "20240604", "20240605", "20240606", "20240607"]
+ )
+ mock_organisation_data = mock_data_h81109 + mock_data_y12345
+ shuffle(mock_organisation_data)
+
+ active_users_count_h81109 = count_unique_user_ids(mock_data_h81109)
+ active_users_count_y12345 = count_unique_user_ids(mock_data_y12345)
+
+ expected = pl.DataFrame(
+ [
+ {"ods_code": "H81109", "active_users_count": active_users_count_h81109},
+ {"ods_code": "Y12345", "active_users_count": active_users_count_y12345},
+ ]
+ )
+ actual = mock_service.summarise_application_data(mock_organisation_data)
+
+ assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False)
+
+
+def count_unique_user_ids(mock_data: list[ApplicationData]) -> int:
+ active_users_of_each_day = [set(data.active_user_ids_hashed) for data in mock_data]
+ unique_active_users_for_whole_week = set.union(*active_users_of_each_day)
+ return len(unique_active_users_for_whole_week)
+
+
+def test_summarise_application_data_can_handle_empty_input(mock_service):
+ empty_input = []
+ actual = mock_service.summarise_application_data(empty_input)
+
+ assert isinstance(actual, pl.DataFrame)
+ assert actual.is_empty()
+
+
+def test_join_dataframes_by_ods_code(mock_service):
+ mock_data_1 = pl.DataFrame([{"ods_code": "Y12345", "field1": "apple"}])
+ mock_data_2 = pl.DataFrame(
+ [
+ {"ods_code": "Y12345", "field2": "banana"},
+ {"ods_code": "Z56789", "field2": "cherry"},
+ ]
+ )
+
+ expected = pl.DataFrame(
+ [
+ {"ods_code": "Y12345", "field1": "apple", "field2": "banana"},
+ {"ods_code": "Z56789", "field2": "cherry"},
+ ]
+ )
+ actual = mock_service.join_dataframes_by_ods_code([mock_data_1, mock_data_2])
+
+ assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False)
+
+
+def test_join_dataframes_by_ods_code_can_handle_empty_dataframe(mock_service):
+ mock_data_1 = pl.DataFrame([{"ods_code": "Y12345", "field1": "cat"}])
+ mock_data_2 = pl.DataFrame()
+ mock_data_3 = pl.DataFrame(
+ [
+ {"ods_code": "Y12345", "field2": "dog"},
+ {"ods_code": "Z56789", "field3": "lizard"},
+ ]
+ )
+
+ expected = pl.DataFrame(
+ [
+ {"ods_code": "Y12345", "field1": "cat", "field2": "dog"},
+ {"ods_code": "Z56789", "field3": "lizard"},
+ ]
+ )
+ actual = mock_service.join_dataframes_by_ods_code(
+ [mock_data_1, mock_data_2, mock_data_3]
+ )
+
+ assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False)
+
+
+@freeze_time("20240512T07:00:00Z")
+def test_store_report_to_s3(set_env, mock_s3_service, mock_temp_folder):
+ mock_weekly_summary = EXPECTED_WEEKLY_SUMMARY
+ expected_filename = "statistical_report_20240505-20240511.csv"
+ expected_local_file_path = f"{mock_temp_folder}/{expected_filename}"
+
+ service = StatisticalReportService()
+
+ service.store_report_to_s3(mock_weekly_summary)
+
+ mock_s3_service.upload_file.assert_called_with(
+ expected_local_file_path, MOCK_STATISTICS_REPORT_BUCKET, expected_filename
+ )
diff --git a/lambdas/tests/unit/utils/test_utilities.py b/lambdas/tests/unit/utils/test_utilities.py
index 8dd31f6fe..e81617629 100755
--- a/lambdas/tests/unit/utils/test_utilities.py
+++ b/lambdas/tests/unit/utils/test_utilities.py
@@ -2,6 +2,7 @@
from utils.exceptions import InvalidResourceIdException
from utils.utilities import (
camelize_dict,
+ flatten,
get_file_key_from_s3_url,
redact_id_to_last_4_chars,
validate_nhs_number,
@@ -48,3 +49,12 @@ def test_get_file_key_from_s3_url():
actual = get_file_key_from_s3_url(test_url)
assert actual == expected
+
+
+def test_flatten_reduce_one_level_of_nesting_given_a_nested_list():
+ nested_list = [["a", "b", "c"], ["d", "e"], ["f"], ["a"]]
+ expected = ["a", "b", "c", "d", "e", "f", "a"]
+
+ actual = flatten(nested_list)
+
+ assert actual == expected
diff --git a/lambdas/utils/cloudwatch_logs_query.py b/lambdas/utils/cloudwatch_logs_query.py
new file mode 100644
index 000000000..21d89130a
--- /dev/null
+++ b/lambdas/utils/cloudwatch_logs_query.py
@@ -0,0 +1,53 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class CloudwatchLogsQueryParams:
+ lambda_name: str
+ query_string: str
+
+
+LloydGeorgeRecordsViewed = CloudwatchLogsQueryParams(
+ lambda_name="LloydGeorgeStitchLambda",
+ query_string="""
+ fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code
+ | filter Message = 'User has viewed Lloyd George records'
+ | stats count() AS daily_count_viewed BY ods_code
+ """,
+)
+
+LloydGeorgeRecordsDownloaded = CloudwatchLogsQueryParams(
+ lambda_name="DocumentManifestByNHSNumberLambda",
+ query_string="""
+ fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code
+ | filter Message = 'User has downloaded Lloyd George records'
+ | stats count() AS daily_count_downloaded BY ods_code
+ """,
+)
+
+LloydGeorgeRecordsDeleted = CloudwatchLogsQueryParams(
+ lambda_name="DeleteDocRefLambda",
+ query_string="""
+ fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code
+ | filter Message = "Deleted document of type LG"
+ | stats count() AS daily_count_deleted BY ods_code
+ """,
+)
+
+LloydGeorgeRecordsStored = CloudwatchLogsQueryParams(
+ lambda_name="UploadConfirmResultLambda",
+ query_string="""
+ fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code
+ | filter Message = 'Finished processing all documents'
+ | stats count() AS daily_count_stored BY ods_code
+ """,
+)
+
+UniqueActiveUserIds = CloudwatchLogsQueryParams(
+ lambda_name="AuthoriserLambda",
+ query_string="""
+ fields @timestamp, Authorisation.selected_organisation.org_ods_code AS ods_code, Authorisation.nhs_user_id AS user_id
+ | filter ispresent(ods_code) AND ispresent(user_id)
+ | dedup(ods_code, user_id)
+ """,
+)
diff --git a/lambdas/utils/decorators/ensure_env_var.py b/lambdas/utils/decorators/ensure_env_var.py
index 6a3cb091a..1e0da023b 100644
--- a/lambdas/utils/decorators/ensure_env_var.py
+++ b/lambdas/utils/decorators/ensure_env_var.py
@@ -3,6 +3,7 @@
from enums.lambda_error import LambdaError
from utils.audit_logging_setup import LoggingService
+from utils.exceptions import MissingEnvVarException
from utils.lambda_response import ApiGatewayResponse
logger = LoggingService(__name__)
@@ -37,3 +38,34 @@ def interceptor(event, context):
return interceptor
return wrapper
+
+
+def ensure_environment_variables_for_non_webapi(names: list[str]) -> Callable:
+ """A decorator for lambda handler.
+ Verify that the lambda environment got a set of specific environment variables.
+ If not, log and throw an error.
+ Use for lambdas that are NOT supposed to be integrated with API Gateway.
+
+ Usage:
+ @ensure_environment_variables_for_non_webapi(names=["LLOYD_GEORGE_BUCKET_NAME", "LLOYD_GEORGE_DYNAMODB_NAME"])
+ def lambda_handler(event, context):
+ ...
+ """
+
+ def wrapper(lambda_func: Callable):
+ def interceptor(event, context):
+ missing_env_vars = set(names) - set(os.environ)
+ if missing_env_vars:
+ missing_env_vars_in_string = ", ".join(sorted(missing_env_vars))
+ error_body = LambdaError.EnvMissing.create_error_body(
+ {"name": missing_env_vars_in_string}
+ )
+ logger.error(error_body, {"Result": "Failed to run lambda"})
+ raise MissingEnvVarException(error_body)
+
+ # Validation done. Return control flow to original lambda handler
+ return lambda_func(event, context)
+
+ return interceptor
+
+ return wrapper
diff --git a/lambdas/utils/exceptions.py b/lambdas/utils/exceptions.py
index 5c230edf9..098ae2891 100644
--- a/lambdas/utils/exceptions.py
+++ b/lambdas/utils/exceptions.py
@@ -114,3 +114,11 @@ class FhirResourceNotFound(Exception):
class FileUploadInProgress(Exception):
pass
+
+
+class LogsQueryException(Exception):
+ pass
+
+
+class StatisticDataNotFoundException(Exception):
+ pass
diff --git a/lambdas/utils/utilities.py b/lambdas/utils/utilities.py
index cacd9d4d4..7c3493d59 100755
--- a/lambdas/utils/utilities.py
+++ b/lambdas/utils/utilities.py
@@ -1,3 +1,4 @@
+import itertools
import os
import re
import uuid
@@ -54,3 +55,7 @@ def redact_id_to_last_4_chars(str_id: str) -> str:
def get_file_key_from_s3_url(s3_url: str) -> str:
return urlparse(s3_url).path.lstrip("/")
+
+
+def flatten(nested_list: list[list]) -> list:
+ return list(itertools.chain(*nested_list))
diff --git a/sonar-project.properties b/sonar-project.properties
index 274a61e0a..27633ba18 100644
--- a/sonar-project.properties
+++ b/sonar-project.properties
@@ -14,7 +14,7 @@ sonar.python.coverage.reportPaths=lambdas/coverage.xml
sonar.sources=lambdas/,app/src/
sonar.tests=lambdas/tests/,app/src/
-sonar.exclusions=**/*.test.tsx,app/src/helpers/test/,**/*.story.tsx,**/TestPanel.tsx,lambdas/scripts/,**/*.test.ts,**/*.story.ts,lambdas/tests/
+sonar.exclusions=**/*.test.tsx,app/src/helpers/test/,**/*.story.tsx,**/TestPanel.tsx,lambdas/scripts/,**/*.test.ts,**/*.story.ts,lambdas/tests/*
sonar.test.inclusions=**/*.test.tsx,app/src/helpers/test/,**/*.test.ts
# Encoding of the source code. Default is default system encoding