diff --git a/.github/workflows/base-lambdas-reusable-deploy-all.yml b/.github/workflows/base-lambdas-reusable-deploy-all.yml index 40c2a65e4..b77b59140 100644 --- a/.github/workflows/base-lambdas-reusable-deploy-all.yml +++ b/.github/workflows/base-lambdas-reusable-deploy-all.yml @@ -316,4 +316,32 @@ jobs: lambda_aws_name: UpdateUploadStateLambda lambda_layer_names: "core_lambda_layer" secrets: - AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} \ No newline at end of file + AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} + + deploy_data_collection_lambda: + name: Deploy data collection lambda + uses: ./.github/workflows/base-lambdas-reusable-deploy.yml + with: + environment: ${{ inputs.environment}} + python_version: ${{ inputs.python_version }} + build_branch: ${{ inputs.build_branch}} + sandbox: ${{ inputs.sandbox }} + lambda_handler_name: data_collection_handler + lambda_aws_name: DataCollectionLambda + lambda_layer_names: "core_lambda_layer,data_lambda_layer" + secrets: + AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} + + deploy_statistical_report_lambda: + name: Deploy statistical report lambda + uses: ./.github/workflows/base-lambdas-reusable-deploy.yml + with: + environment: ${{ inputs.environment}} + python_version: ${{ inputs.python_version }} + build_branch: ${{ inputs.build_branch}} + sandbox: ${{ inputs.sandbox }} + lambda_handler_name: statistical_report_handler + lambda_aws_name: StatisticalReportLambda + lambda_layer_names: "core_lambda_layer,data_lambda_layer" + secrets: + AWS_ASSUME_ROLE: ${{ secrets.AWS_ASSUME_ROLE }} diff --git a/.gitignore b/.gitignore index 59acc5ef5..65d4573e7 100644 --- a/.gitignore +++ b/.gitignore @@ -101,4 +101,7 @@ node_modules/ lambdas/tests/unit/helpers/data/pdf/tmp /lambdas/package_/ -batch_update_progress.json \ No newline at end of file +batch_update_progress.json + +# jupyter notebook files +*.ipynb \ No newline at end of file diff --git a/Makefile b/Makefile index 1908f9f25..f554b76bb 100644 --- a/Makefile +++ b/Makefile @@ -56,8 +56,7 @@ test-unit-coverage: cd ./lambdas && ./venv/bin/python3 -m pytest --cov=. --cov-report xml:coverage.xml test-unit-coverage-html: - cd ./lambdas - coverage run --source=. --omit=tests/* -m pytest -v tests && coverage report && coverage html + cd ./lambdas && coverage run --source=. --omit="tests/*" -m pytest -v tests && coverage report && coverage html test-unit-collect: cd ./lambdas && ./venv/bin/python3 -m pytest tests/ --collect-only diff --git a/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx b/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx index 0c99835e1..364f2076b 100644 --- a/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx +++ b/app/src/components/blocks/_delete/removeRecordStage/RemoveRecordStage.tsx @@ -1,6 +1,6 @@ import React, { Dispatch, SetStateAction, useEffect, useRef, useState } from 'react'; import useTitle from '../../../../helpers/hooks/useTitle'; -import { BackLink, Button, Table, WarningCallout } from 'nhsuk-react-components'; +import { Button, Table, WarningCallout } from 'nhsuk-react-components'; import LinkButton from '../../../generic/linkButton/LinkButton'; import { SearchResult } from '../../../../types/generic/searchResult'; import getDocumentSearchResults from '../../../../helpers/requests/getDocumentSearchResults'; @@ -23,6 +23,7 @@ import { DOCUMENT_TYPE } from '../../../../types/pages/UploadDocumentsPage/types import DeleteResultStage from '../deleteResultStage/DeleteResultStage'; import { DOWNLOAD_STAGE } from '../../../../types/generic/downloadStage'; import PatientSummary from '../../../generic/patientSummary/PatientSummary'; +import BackButton from '../../../generic/backButton/BackButton'; export type Props = { numberOfFiles: number; @@ -89,16 +90,7 @@ function RemoveRecordStage({ numberOfFiles, recordType, setDownloadStage }: Prop const PageIndexView = () => ( <> - { - e.preventDefault(); - navigate(routes.LLOYD_GEORGE); - }} - > - Go back - +

Remove this {recordType} record

Before removing diff --git a/app/src/components/generic/backButton/BackButton.test.tsx b/app/src/components/generic/backButton/BackButton.test.tsx index 06f6a7e9d..2e31cf500 100644 --- a/app/src/components/generic/backButton/BackButton.test.tsx +++ b/app/src/components/generic/backButton/BackButton.test.tsx @@ -26,7 +26,7 @@ describe('BackButton', () => { mockPathname = { pathname: testUrl }; render(); - userEvent.click(screen.getByText('Back')); + userEvent.click(screen.getByText('Go back')); await waitFor(() => { expect(mockUseNavigate).toHaveBeenCalledWith(-1); diff --git a/app/src/components/generic/backButton/BackButton.tsx b/app/src/components/generic/backButton/BackButton.tsx index e76ad9a2f..b151b2b95 100644 --- a/app/src/components/generic/backButton/BackButton.tsx +++ b/app/src/components/generic/backButton/BackButton.tsx @@ -12,8 +12,8 @@ const BackButton = () => { }; return ( - - Back + + Go back ); }; diff --git a/lambdas/enums/supported_document_types.py b/lambdas/enums/supported_document_types.py index f97a7e5fe..a9a0bca0d 100644 --- a/lambdas/enums/supported_document_types.py +++ b/lambdas/enums/supported_document_types.py @@ -39,3 +39,10 @@ def get_dynamodb_table_name(self) -> str: SupportedDocumentTypes.LG: os.getenv("LLOYD_GEORGE_DYNAMODB_NAME"), } return document_type_to_table_name[self] + + def get_s3_bucket_name(self) -> str: + lookup_dict = { + SupportedDocumentTypes.ARF: os.getenv("DOCUMENT_STORE_BUCKET_NAME"), + SupportedDocumentTypes.LG: os.getenv("LLOYD_GEORGE_BUCKET_NAME"), + } + return lookup_dict[self] diff --git a/lambdas/handlers/data_collection_handler.py b/lambdas/handlers/data_collection_handler.py new file mode 100644 index 000000000..673a84d3e --- /dev/null +++ b/lambdas/handlers/data_collection_handler.py @@ -0,0 +1,23 @@ +from services.data_collection_service import DataCollectionService +from utils.audit_logging_setup import LoggingService +from utils.decorators.ensure_env_var import ensure_environment_variables_for_non_webapi +from utils.decorators.override_error_check import override_error_check + +logger = LoggingService(__name__) + + +@ensure_environment_variables_for_non_webapi( + names=[ + "LLOYD_GEORGE_DYNAMODB_NAME", + "LLOYD_GEORGE_BUCKET_NAME", + "DOCUMENT_STORE_DYNAMODB_NAME", + "DOCUMENT_STORE_BUCKET_NAME", + "WORKSPACE", + "STATISTICS_TABLE", + ] +) +@override_error_check +def lambda_handler(_event, _context): + logger.info("Starting data collection process") + service = DataCollectionService() + service.collect_all_data_and_write_to_dynamodb() diff --git a/lambdas/handlers/statistical_report_handler.py b/lambdas/handlers/statistical_report_handler.py new file mode 100644 index 000000000..8b701c7d5 --- /dev/null +++ b/lambdas/handlers/statistical_report_handler.py @@ -0,0 +1,20 @@ +from services.statistical_report_service import StatisticalReportService +from utils.audit_logging_setup import LoggingService +from utils.decorators.ensure_env_var import ensure_environment_variables_for_non_webapi +from utils.decorators.override_error_check import override_error_check + +logger = LoggingService(__name__) + + +@ensure_environment_variables_for_non_webapi( + names=[ + "WORKSPACE", + "STATISTICS_TABLE", + "STATISTICAL_REPORTS_BUCKET", + ] +) +@override_error_check +def lambda_handler(_event, _context): + logger.info("Starting creating statistical report") + service = StatisticalReportService() + service.make_weekly_summary_and_output_to_bucket() diff --git a/lambdas/models/statistics.py b/lambdas/models/statistics.py new file mode 100644 index 000000000..5e5471221 --- /dev/null +++ b/lambdas/models/statistics.py @@ -0,0 +1,88 @@ +import uuid +from decimal import Decimal +from typing import NamedTuple + +from models.config import to_capitalized_camel +from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator + + +class StatisticData(BaseModel): + model_config = ConfigDict( + alias_generator=to_capitalized_camel, populate_by_name=True + ) + statistic_id: str = Field( + default_factory=lambda: str(uuid.uuid4()), alias="StatisticID" + ) + date: str + ods_code: str + + @field_serializer("statistic_id") + def serialise_id(self, statistic_id) -> str: + return f"{self.__class__.__name__}#{statistic_id}" + + # noinspection PyNestedDecorators + @field_validator("statistic_id") + @classmethod + def deserialize_id(cls, raw_statistic_id: str) -> str: + if "#" in raw_statistic_id: + record_type, uuid_part = raw_statistic_id.split("#") + class_name = cls.__name__ + assert ( + record_type == class_name + ), f"StatisticID must be in the form of `{class_name}#uuid`" + else: + uuid_part = raw_statistic_id + + return uuid_part + + # noinspection PyNestedDecorators + @field_validator("ods_code") + @classmethod + def fill_empty_ods_code(cls, ods_code: str) -> str: + if not ods_code: + return "NO_ODS_CODE" + return ods_code + + +class RecordStoreData(StatisticData): + total_number_of_records: int = 0 + number_of_document_types: int = 0 + total_size_of_records_in_megabytes: Decimal = Decimal(0) + average_size_of_documents_per_patient_in_megabytes: Decimal = Decimal(0) + + +class OrganisationData(StatisticData): + number_of_patients: int = 0 + average_records_per_patient: Decimal = Decimal(0) + daily_count_stored: int = 0 + daily_count_viewed: int = 0 + daily_count_downloaded: int = 0 + daily_count_deleted: int = 0 + + +class ApplicationData(StatisticData): + active_user_ids_hashed: list[str] = [] + + +class LoadedStatisticData(NamedTuple): + record_store_data: list[RecordStoreData] + organisation_data: list[OrganisationData] + application_data: list[ApplicationData] + + +def load_from_dynamodb_items(dynamodb_items: list[dict]) -> LoadedStatisticData: + output = LoadedStatisticData([], [], []) + + for item in dynamodb_items: + data_type = item["StatisticID"].split("#")[0] + match data_type: + case "RecordStoreData": + output.record_store_data.append(RecordStoreData.model_validate(item)) + case "OrganisationData": + output.organisation_data.append(OrganisationData.model_validate(item)) + case "ApplicationData": + output.application_data.append(ApplicationData.model_validate(item)) + case _: + raise ValueError(f"unknown type of statistic data: {data_type}") + + return output diff --git a/lambdas/requirements/requirements_test.txt b/lambdas/requirements/requirements_test.txt index ad58448c0..18c7af468 100644 --- a/lambdas/requirements/requirements_test.txt +++ b/lambdas/requirements/requirements_test.txt @@ -6,6 +6,7 @@ isort==5.13.0 pip-audit==2.6.1 pytest-cov==4.1.0 pytest-mock==3.11.1 +pytest-unordered==0.6.0 pytest==7.4.3 requests_mock==1.11.0 ruff==0.0.284 diff --git a/lambdas/services/authoriser_service.py b/lambdas/services/authoriser_service.py index 6cc107d63..42fb76c16 100644 --- a/lambdas/services/authoriser_service.py +++ b/lambdas/services/authoriser_service.py @@ -76,7 +76,7 @@ def find_login_session(self, ndr_session_id): ) session_table_name = os.environ["AUTH_SESSION_TABLE_NAME"] db_service = DynamoDBService() - query_response = db_service.simple_query( + query_response = db_service.query_all_fields( table_name=session_table_name, key_condition_expression=Key("NDRSessionId").eq(ndr_session_id), ) diff --git a/lambdas/services/base/cloudwatch_service.py b/lambdas/services/base/cloudwatch_service.py new file mode 100644 index 000000000..5b30c6fa3 --- /dev/null +++ b/lambdas/services/base/cloudwatch_service.py @@ -0,0 +1,59 @@ +import os +import time + +import boto3 +from utils.audit_logging_setup import LoggingService +from utils.cloudwatch_logs_query import CloudwatchLogsQueryParams +from utils.exceptions import LogsQueryException + +logger = LoggingService(__name__) + + +class CloudwatchService: + def __init__(self): + self.logs_client = boto3.client("logs") + self.workspace = os.environ["WORKSPACE"] + self.initialised = True + + def query_logs( + self, query_params: CloudwatchLogsQueryParams, start_time: int, end_time: int + ) -> list[dict]: + response = self.logs_client.start_query( + logGroupName=f"/aws/lambda/{self.workspace}_{query_params.lambda_name}", + startTime=start_time, + endTime=end_time, + queryString=query_params.query_string, + ) + query_id = response["queryId"] + + raw_query_result = self.poll_query_result(query_id) + query_result = self.regroup_raw_query_result(raw_query_result) + return query_result + + def poll_query_result(self, query_id: str, max_retries=20) -> list[list]: + for _ in range(max_retries): + response = self.logs_client.get_query_results(queryId=query_id) + if response["status"] == "Complete": + return response["results"] + elif response["status"] in ["Failed", "Cancelled", "Timeout"]: + self.log_and_raise_error( + f"Logs query failed with status: {response['status']}" + ) + time.sleep(1) + + self.log_and_raise_error( + f"Failed to get query result within max retries of {max_retries} times" + ) + + @staticmethod + def regroup_raw_query_result(raw_query_result: list[list[dict]]) -> list[dict]: + query_result = [ + {column["field"]: column["value"] for column in row} + for row in raw_query_result + ] + return query_result + + @staticmethod + def log_and_raise_error(error_message: str) -> None: + logger.error(error_message) + raise LogsQueryException(error_message) diff --git a/lambdas/services/base/dynamo_service.py b/lambdas/services/base/dynamo_service.py index 94adc1531..6fd9c592e 100644 --- a/lambdas/services/base/dynamo_service.py +++ b/lambdas/services/base/dynamo_service.py @@ -1,3 +1,5 @@ +from typing import Optional + import boto3 from boto3.dynamodb.conditions import Attr, ConditionBase, Key from botocore.exceptions import ClientError @@ -76,7 +78,7 @@ def query_with_requested_fields( logger.error(str(e), {"Result": f"Unable to query table: {table_name}"}) raise e - def simple_query(self, table_name: str, key_condition_expression): + def query_all_fields(self, table_name: str, key_condition_expression): """ Allow querying dynamodb table without explicitly defining the fields to retrieve. :param table_name: Dynamodb table name @@ -85,7 +87,7 @@ def simple_query(self, table_name: str, key_condition_expression): example usage: from boto3.dynamodb.conditions import Key - query_response = db_service.simple_query( + query_response = db_service.query_all_fields( table_name=session_table_name, key_condition_expression=Key("NDRSessionId").eq(ndr_session_id) ) @@ -161,6 +163,35 @@ def scan_table( logger.error(str(e), {"Result": f"Unable to scan table: {table_name}"}) raise e + def scan_whole_table( + self, + table_name: str, + project_expression: Optional[str] = None, + filter_expression: Optional[str] = None, + ) -> list[dict]: + try: + table = self.get_table(table_name) + scan_arguments = {} + if project_expression: + scan_arguments["ProjectionExpression"] = project_expression + if filter_expression: + scan_arguments["FilterExpression"] = filter_expression + + paginated_result = table.scan(**scan_arguments) + dynamodb_scan_result = paginated_result.get("Items", []) + while "LastEvaluatedKey" in paginated_result: + start_key_for_next_page = paginated_result["LastEvaluatedKey"] + paginated_result = table.scan( + **scan_arguments, + ExclusiveStartKey=start_key_for_next_page, + ) + dynamodb_scan_result += paginated_result["Items"] + return dynamodb_scan_result + + except ClientError as e: + logger.error(str(e), {"Result": f"Unable to scan table: {table_name}"}) + raise e + def batch_writing(self, table_name: str, item_list: list[dict]): try: table = self.get_table(table_name) diff --git a/lambdas/services/base/s3_service.py b/lambdas/services/base/s3_service.py index 25ca9861c..cbcd33d07 100644 --- a/lambdas/services/base/s3_service.py +++ b/lambdas/services/base/s3_service.py @@ -132,3 +132,10 @@ def file_exist_on_s3(self, s3_bucket_name: str, file_key: str) -> bool: return False logger.error(str(e), {"Result": "Failed to check if file exists on s3"}) raise e + + def list_all_objects(self, bucket_name: str) -> list[dict]: + s3_paginator = self.client.get_paginator("list_objects_v2") + s3_list_objects_result = [] + for paginated_result in s3_paginator.paginate(Bucket=bucket_name): + s3_list_objects_result += paginated_result.get("Contents", []) + return s3_list_objects_result diff --git a/lambdas/services/data_collection_service.py b/lambdas/services/data_collection_service.py new file mode 100644 index 000000000..d1320cc2a --- /dev/null +++ b/lambdas/services/data_collection_service.py @@ -0,0 +1,337 @@ +import hashlib +import os +from collections import Counter, defaultdict +from datetime import datetime + +import polars as pl +from enums.metadata_field_names import DocumentReferenceMetadataFields +from enums.supported_document_types import SupportedDocumentTypes +from models.statistics import ( + ApplicationData, + OrganisationData, + RecordStoreData, + StatisticData, +) +from services.base.cloudwatch_service import CloudwatchService +from services.base.dynamo_service import DynamoDBService +from services.base.s3_service import S3Service +from utils.audit_logging_setup import LoggingService +from utils.cloudwatch_logs_query import ( + CloudwatchLogsQueryParams, + LloydGeorgeRecordsDeleted, + LloydGeorgeRecordsDownloaded, + LloydGeorgeRecordsStored, + LloydGeorgeRecordsViewed, + UniqueActiveUserIds, +) +from utils.common_query_filters import UploadCompleted +from utils.utilities import flatten, get_file_key_from_s3_url + +logger = LoggingService(__name__) + +Fields = DocumentReferenceMetadataFields +OdsCodeFieldName: str = Fields.CURRENT_GP_ODS.value +NhsNumberFieldName: str = Fields.NHS_NUMBER.value +FileLocationFieldName: str = Fields.FILE_LOCATION.value +ContentTypeFieldName: str = Fields.CONTENT_TYPE.value + + +class DataCollectionService: + def __init__(self): + self.workspace = os.environ["WORKSPACE"] + self.output_table_name = os.environ["STATISTICS_TABLE"] + + self.cloudwatch_service = CloudwatchService() + self.dynamodb_service = DynamoDBService() + self.s3_service = S3Service() + + one_day = 60 * 60 * 24 + time_now = int(datetime.now().timestamp()) + + self.collection_start_time = time_now - one_day + self.collection_end_time = time_now + self.today_date = datetime.today().strftime("%Y%m%d") + + def collect_all_data_and_write_to_dynamodb(self): + time_period_human_readable = ( + f"{datetime.fromtimestamp(self.collection_start_time)}" + f" ~ {datetime.fromtimestamp(self.collection_end_time)}" + ) + logger.info(f"Collecting data between {time_period_human_readable}.") + + all_statistic_data = self.collect_all_data() + logger.info("Finished collecting data. Will output to dynamodb table.") + + self.write_to_dynamodb_table(all_statistic_data) + logger.info("Data collection completed.", {"Result": "Successful"}) + + def collect_all_data(self) -> list[StatisticData]: + dynamodb_scan_result = self.scan_dynamodb_tables() + s3_list_objects_result = self.get_all_s3_files_info() + + record_store_data = self.get_record_store_data( + dynamodb_scan_result, s3_list_objects_result + ) + organisation_data = self.get_organisation_data(dynamodb_scan_result) + application_data = self.get_application_data() + + return record_store_data + organisation_data + application_data + + def write_to_dynamodb_table(self, all_statistic_data: list[StatisticData]): + logger.info("Writing statistic data to dynamodb table") + item_list = [] + for entry in all_statistic_data: + item_list.append(entry.model_dump(by_alias=True)) + + self.dynamodb_service.batch_writing( + table_name=self.output_table_name, item_list=item_list + ) + + logger.info("Finish writing all data to dynamodb table") + + def scan_dynamodb_tables(self) -> list[dict]: + all_results = [] + + field_names_to_fetch = [ + OdsCodeFieldName, + NhsNumberFieldName, + FileLocationFieldName, + ContentTypeFieldName, + ] + project_expression = ",".join(field_names_to_fetch) + filter_expression = UploadCompleted + + for doc_type in SupportedDocumentTypes.list(): + table_name = doc_type.get_dynamodb_table_name() + result = self.dynamodb_service.scan_whole_table( + table_name=table_name, + project_expression=project_expression, + filter_expression=filter_expression, + ) + all_results.extend(result) + + return all_results + + def get_all_s3_files_info(self) -> list[dict]: + all_results = [] + for doc_type in SupportedDocumentTypes.list(): + bucket_name = doc_type.get_s3_bucket_name() + result = self.s3_service.list_all_objects(bucket_name) + all_results += result + + return all_results + + def get_record_store_data( + self, + dynamodb_scan_result: list[dict], + s3_list_objects_result: list[dict], + ) -> list[RecordStoreData]: + total_number_of_records = self.get_total_number_of_records(dynamodb_scan_result) + + total_and_average_file_sizes = ( + self.get_metrics_for_total_and_average_file_sizes( + dynamodb_scan_result, s3_list_objects_result + ) + ) + + number_of_document_types = self.get_number_of_document_types( + dynamodb_scan_result + ) + + joined_query_result = self.join_results_by_ods_code( + [ + total_number_of_records, + total_and_average_file_sizes, + number_of_document_types, + ] + ) + + record_store_data_for_all_ods_code = [ + RecordStoreData( + date=self.today_date, + **record_store_data_properties, + ) + for record_store_data_properties in joined_query_result + ] + + return record_store_data_for_all_ods_code + + def get_organisation_data( + self, dynamodb_scan_result: list[dict] + ) -> list[OrganisationData]: + number_of_patients = self.get_number_of_patients(dynamodb_scan_result) + average_records_per_patient = self.get_average_number_of_files_per_patient( + dynamodb_scan_result + ) + daily_count_viewed = self.get_cloud_watch_query_result(LloydGeorgeRecordsViewed) + daily_count_downloaded = self.get_cloud_watch_query_result( + LloydGeorgeRecordsDownloaded + ) + daily_count_deleted = self.get_cloud_watch_query_result( + LloydGeorgeRecordsDeleted + ) + daily_count_stored = self.get_cloud_watch_query_result(LloydGeorgeRecordsStored) + + joined_query_result = self.join_results_by_ods_code( + [ + number_of_patients, + average_records_per_patient, + daily_count_viewed, + daily_count_downloaded, + daily_count_deleted, + daily_count_stored, + ] + ) + + organisation_data_for_all_ods_code = [ + OrganisationData(date=self.today_date, **organisation_data_properties) + for organisation_data_properties in joined_query_result + ] + + return organisation_data_for_all_ods_code + + def get_application_data(self) -> list[ApplicationData]: + user_id_per_ods_code = self.get_active_user_list() + application_data_for_all_ods_code = [ + ApplicationData( + date=self.today_date, + active_user_ids_hashed=active_user_ids_hashed, + ods_code=ods_code, + ) + for ods_code, active_user_ids_hashed in user_id_per_ods_code.items() + ] + return application_data_for_all_ods_code + + def get_active_user_list(self) -> dict[str, list]: + query_result = self.get_cloud_watch_query_result( + query_params=UniqueActiveUserIds + ) + user_ids_per_ods_code = defaultdict(list) + for entry in query_result: + ods_code = entry.get("ods_code") + user_id = entry.get("user_id") + hashed_user_id = hashlib.sha256(bytes(user_id, "utf8")).hexdigest() + user_ids_per_ods_code[ods_code].append(hashed_user_id) + + return user_ids_per_ods_code + + def get_cloud_watch_query_result( + self, query_params: CloudwatchLogsQueryParams + ) -> list[dict]: + return self.cloudwatch_service.query_logs( + query_params=query_params, + start_time=self.collection_start_time, + end_time=self.collection_end_time, + ) + + def get_total_number_of_records( + self, dynamodb_scan_result: list[dict] + ) -> list[dict]: + ods_code_for_every_document = [ + item[OdsCodeFieldName] for item in dynamodb_scan_result + ] + counted_by_ods_code = Counter(ods_code_for_every_document) + count_result_in_list_of_dict = [ + {"ods_code": key, "total_number_of_records": value} + for key, value in counted_by_ods_code.items() + ] + return count_result_in_list_of_dict + + def get_number_of_patients(self, dynamodb_scan_result: list[dict]) -> list[dict]: + patients_grouped_by_ods_code = defaultdict(set) + for item in dynamodb_scan_result: + ods_code = item[OdsCodeFieldName] + patients_grouped_by_ods_code[ods_code].add(item[NhsNumberFieldName]) + + return [ + {"ods_code": ods_code, "number_of_patients": len(patients)} + for ods_code, patients in patients_grouped_by_ods_code.items() + ] + + def get_metrics_for_total_and_average_file_sizes( + self, + dynamodb_scan_result: list[dict], + s3_list_objects_result: list[dict], + ) -> list[dict]: + dynamodb_df = pl.DataFrame(dynamodb_scan_result) + s3_df = pl.DataFrame(s3_list_objects_result) + + dynamodb_df_with_file_key = dynamodb_df.with_columns( + pl.col(FileLocationFieldName) + .map_elements(get_file_key_from_s3_url, return_dtype=pl.String) + .alias("S3FileKey") + ) + joined_df = dynamodb_df_with_file_key.join( + s3_df, how="left", left_on="S3FileKey", right_on="Key", coalesce=True + ) + + get_total_size = (pl.col("Size").sum() / 1024 / 1024).alias( + "TotalFileSizeForPatientInMegabytes" + ) + + get_average_file_size_per_patient = ( + pl.col("TotalFileSizeForPatientInMegabytes") + .mean() + .alias("average_size_of_documents_per_patient_in_megabytes") + ) + + get_total_file_size_for_ods_code = ( + pl.col("TotalFileSizeForPatientInMegabytes") + .sum() + .alias("total_size_of_records_in_megabytes") + ) + + result = ( + joined_df.group_by(OdsCodeFieldName, NhsNumberFieldName) + .agg(get_total_size) + .group_by(OdsCodeFieldName) + .agg(get_average_file_size_per_patient, get_total_file_size_for_ods_code) + .rename({OdsCodeFieldName: "ods_code"}) + ) + + return result.to_dicts() + + def get_number_of_document_types( + self, dynamodb_scan_result: list[dict] + ) -> list[dict]: + file_types_grouped_by_ods_code = defaultdict(set) + for item in dynamodb_scan_result: + ods_code = item[OdsCodeFieldName] + file_type = item[ContentTypeFieldName] + file_types_grouped_by_ods_code[ods_code].add(file_type) + + return [ + {"ods_code": ods_code, "number_of_document_types": len(file_types)} + for ods_code, file_types in file_types_grouped_by_ods_code.items() + ] + + def get_average_number_of_files_per_patient( + self, + dynamodb_scan_result: list[dict], + ) -> list[dict]: + dynamodb_df = pl.DataFrame(dynamodb_scan_result) + + count_records = pl.len().alias("number_of_records") + take_average_of_record_count = ( + pl.col("number_of_records").mean().alias("average_records_per_patient") + ) + + result = ( + dynamodb_df.group_by(OdsCodeFieldName, NhsNumberFieldName) + .agg(count_records) + .group_by(OdsCodeFieldName) + .agg(take_average_of_record_count) + .rename({OdsCodeFieldName: "ods_code"}) + ) + return result.to_dicts() + + @staticmethod + def join_results_by_ods_code(results: list[list[dict]]) -> list[dict]: + joined_by_ods_code = defaultdict(dict) + for entry in flatten(results): + ods_code = entry["ods_code"] + joined_by_ods_code[ods_code].update(entry) + + joined_result = list(joined_by_ods_code.values()) + + return joined_result diff --git a/lambdas/services/login_service.py b/lambdas/services/login_service.py index a18907cec..bfafc1d2b 100644 --- a/lambdas/services/login_service.py +++ b/lambdas/services/login_service.py @@ -142,7 +142,7 @@ def generate_session(self, state, auth_code) -> dict: def have_matching_state_value_in_record(self, state: str) -> bool: state_table_name = os.environ["AUTH_STATE_TABLE_NAME"] - query_response = self.db_service.simple_query( + query_response = self.db_service.query_all_fields( table_name=state_table_name, key_condition_expression=Key("State").eq(state) ) diff --git a/lambdas/services/statistical_report_service.py b/lambdas/services/statistical_report_service.py new file mode 100644 index 000000000..e2f527f32 --- /dev/null +++ b/lambdas/services/statistical_report_service.py @@ -0,0 +1,225 @@ +import os +import shutil +import tempfile +from datetime import datetime, timedelta + +import polars as pl +import polars.selectors as column_select +from boto3.dynamodb.conditions import Key +from inflection import humanize +from models.statistics import ( + ApplicationData, + LoadedStatisticData, + OrganisationData, + RecordStoreData, + StatisticData, + load_from_dynamodb_items, +) +from services.base.dynamo_service import DynamoDBService +from services.base.s3_service import S3Service +from utils.audit_logging_setup import LoggingService +from utils.exceptions import StatisticDataNotFoundException + +logger = LoggingService(__name__) + + +class StatisticalReportService: + def __init__(self): + self.dynamo_service = DynamoDBService() + self.statistic_table = os.environ["STATISTICS_TABLE"] + + self.s3_service = S3Service() + self.reports_bucket = os.environ["STATISTICAL_REPORTS_BUCKET"] + + last_seven_days = [ + datetime.today() - timedelta(days=i) for i in range(7, 0, -1) + ] + self.dates_to_collect: list[str] = [ + date.strftime("%Y%m%d") for date in last_seven_days + ] + self.report_period = f"{self.dates_to_collect[0]}-{self.dates_to_collect[-1]}" + + def make_weekly_summary_and_output_to_bucket(self) -> None: + weekly_summary = self.make_weekly_summary() + self.store_report_to_s3(weekly_summary) + + def make_weekly_summary(self) -> pl.DataFrame: + (record_store_data, organisation_data, application_data) = ( + self.get_statistic_data() + ) + + weekly_record_store_data = self.summarise_record_store_data(record_store_data) + weekly_organisation_data = self.summarise_organisation_data(organisation_data) + weekly_application_data = self.summarise_application_data(application_data) + + all_summarised_data = [ + weekly_record_store_data, + weekly_organisation_data, + weekly_application_data, + ] + + combined_data = self.join_dataframes_by_ods_code(all_summarised_data) + weekly_summary = self.tidy_up_data(combined_data) + + return weekly_summary + + def get_statistic_data(self) -> LoadedStatisticData: + logger.info("Loading statistic data of previous week from dynamodb...") + logger.info(f"The period to report: {self.dates_to_collect}") + dynamodb_items = [] + for date in self.dates_to_collect: + response = self.dynamo_service.query_all_fields( + table_name=self.statistic_table, + key_condition_expression=Key("Date").eq(date), + ) + dynamodb_items.extend(response["Items"]) + + loaded_data = load_from_dynamodb_items(dynamodb_items) + + all_data_empty = all(not data for data in loaded_data) + if all_data_empty: + logger.error( + f"No statistic data can be found during the period {self.report_period}. " + "Please check whether the data collection lambda worked properly.", + {"Result": "Statistic data not available."}, + ) + raise StatisticDataNotFoundException( + f"No statistic data can be found during the period {self.report_period}" + ) + + return loaded_data + + @staticmethod + def load_data_to_polars(data: list[StatisticData]) -> pl.DataFrame: + cast_decimal_to_float = column_select.by_dtype(pl.datatypes.Decimal).cast( + pl.Float64 + ) + loaded_data = pl.DataFrame(data).with_columns(cast_decimal_to_float) + return loaded_data + + def summarise_record_store_data( + self, record_store_data: list[RecordStoreData] + ) -> pl.DataFrame: + logger.info("Summarising RecordStoreData...") + if not record_store_data: + logger.info("RecordStoreData for this period was empty.") + return pl.DataFrame() + + df = self.load_data_to_polars(record_store_data) + + select_most_recent_records = pl.all().sort_by("date").last() + summarised_data = ( + df.group_by("ods_code") + .agg(select_most_recent_records) + .drop("date", "statistic_id") + ) + + return summarised_data + + def summarise_organisation_data( + self, organisation_data: list[OrganisationData] + ) -> pl.DataFrame: + logger.info("Summarising OrganisationData...") + if not organisation_data: + logger.info("OrganisationData for this period was empty.") + return pl.DataFrame() + + df = self.load_data_to_polars(organisation_data) + + sum_daily_count_to_weekly = ( + column_select.matches("daily") + .sum() + .name.map(lambda column_name: column_name.replace("daily", "weekly")) + ) + take_average_for_patient_record = ( + pl.col("average_records_per_patient").mean().round(3) + ) + select_most_recent_number_of_patients = ( + pl.col("number_of_patients").sort_by("date").last() + ) + summarised_data = df.group_by("ods_code").agg( + sum_daily_count_to_weekly, + take_average_for_patient_record, + select_most_recent_number_of_patients, + ) + return summarised_data + + def summarise_application_data( + self, application_data: list[ApplicationData] + ) -> pl.DataFrame: + logger.info("Summarising ApplicationData...") + if not application_data: + logger.info("ApplicationData for this period was empty.") + return pl.DataFrame() + + df = self.load_data_to_polars(application_data) + + count_unique_ids = ( + pl.concat_list("active_user_ids_hashed") + .flatten() + .unique() + .len() + .alias("active_users_count") + ) + summarised_data = df.group_by("ods_code").agg(count_unique_ids) + return summarised_data + + def join_dataframes_by_ods_code( + self, all_summarised_data: list[pl.DataFrame] + ) -> pl.DataFrame: + data_to_report = [df for df in all_summarised_data if not df.is_empty()] + joined_dataframe = data_to_report[0] + + for other_dataframe in data_to_report[1:]: + joined_dataframe = joined_dataframe.join( + other_dataframe, on="ods_code", how="outer_coalesce" + ) + + return joined_dataframe + + def tidy_up_data(self, joined_data: pl.DataFrame) -> pl.DataFrame: + with_date_column_updated = self.update_date_column(joined_data) + with_columns_reordered = self.reorder_columns(with_date_column_updated) + with_columns_renamed = with_columns_reordered.rename( + self.rename_snakecase_columns + ) + + return with_columns_renamed + + def update_date_column(self, joined_data: pl.DataFrame) -> pl.DataFrame: + date_column_filled_with_report_period = joined_data.with_columns( + pl.lit(self.report_period).alias("date") + ) + return date_column_filled_with_report_period + + def reorder_columns(self, joined_data: pl.DataFrame) -> pl.DataFrame: + all_columns_names = joined_data.columns + columns_to_go_first = ["date", "ods_code"] + other_columns = sorted(set(all_columns_names) - set(columns_to_go_first)) + with_columns_reordered = joined_data.select( + *columns_to_go_first, *other_columns + ) + return with_columns_reordered + + @staticmethod + def rename_snakecase_columns(column_name: str) -> str: + if column_name == "ods_code": + return "ODS code" + else: + return humanize(column_name) + + def store_report_to_s3(self, weekly_summary: pl.DataFrame) -> None: + logger.info("Saving the weekly report as .csv") + file_name = f"statistical_report_{self.report_period}.csv" + temp_folder = tempfile.mkdtemp() + local_file_path = os.path.join(temp_folder, file_name) + try: + weekly_summary.write_csv(local_file_path) + + logger.info("Uploading the csv report to S3 bucket...") + self.s3_service.upload_file(local_file_path, self.reports_bucket, file_name) + + logger.info("The weekly report is stored in s3 bucket.") + logger.info(f"File name: {file_name}") + finally: + shutil.rmtree(temp_folder) diff --git a/lambdas/tests/unit/conftest.py b/lambdas/tests/unit/conftest.py index 5eea7fcff..9fc7c630e 100644 --- a/lambdas/tests/unit/conftest.py +++ b/lambdas/tests/unit/conftest.py @@ -1,4 +1,5 @@ import json +import tempfile from dataclasses import dataclass from enum import Enum from unittest import mock @@ -44,6 +45,8 @@ MOCK_APPCONFIG_APPLICATION_ENV_NAME = "APPCONFIG_APPLICATION" MOCK_APPCONFIG_ENVIRONMENT_ENV_NAME = "APPCONFIG_ENVIRONMENT" MOCK_APPCONFIG_CONFIGURATION_ENV_NAME = "APPCONFIG_CONFIGURATION" +MOCK_STATISTICS_TABLE_NAME = "STATISTICS_TABLE" +MOCK_STATISTICS_REPORT_BUCKET_NAME = "STATISTICAL_REPORTS_BUCKET" MOCK_ARF_TABLE_NAME = "test_arf_dynamoDB_table" MOCK_LG_TABLE_NAME = "test_lg_dynamoDB_table" @@ -55,6 +58,8 @@ MOCK_LG_STAGING_STORE_BUCKET = "test_staging_bulk_store" MOCK_LG_METADATA_SQS_QUEUE = "test_bulk_upload_metadata_queue" MOCK_LG_INVALID_SQS_QUEUE = "INVALID_SQS_QUEUE_URL" +MOCK_STATISTICS_TABLE = "test_statistics_table" +MOCK_STATISTICS_REPORT_BUCKET = "test_statistics_report_bucket" TEST_NHS_NUMBER = "9000000009" TEST_OBJECT_KEY = "1234-4567-8912-HSDF-TEST" @@ -137,16 +142,20 @@ def set_env(monkeypatch): ) monkeypatch.setenv( MOCK_APPCONFIG_APPLICATION_ENV_NAME, MOCK_APPCONFIG_APPLICATION_ID - ), + ) monkeypatch.setenv( MOCK_APPCONFIG_ENVIRONMENT_ENV_NAME, MOCK_APPCONFIG_ENVIRONMENT_ID - ), + ) monkeypatch.setenv( MOCK_APPCONFIG_CONFIGURATION_ENV_NAME, MOCK_APPCONFIG_CONFIGURATION_ID ) monkeypatch.setenv( MOCK_PRESIGNED_URL_ROLE_ARN_KEY, MOCK_PRESIGNED_URL_ROLE_ARN_VALUE ) + monkeypatch.setenv(MOCK_STATISTICS_TABLE_NAME, MOCK_STATISTICS_TABLE) + monkeypatch.setenv( + MOCK_STATISTICS_REPORT_BUCKET_NAME, MOCK_STATISTICS_REPORT_BUCKET + ) @pytest.fixture(scope="session", autouse=True) @@ -235,3 +244,10 @@ class MockError(Enum): "err_code": "AB_XXXX", "interaction_id": "88888888-4444-4444-4444-121212121212", } + + +@pytest.fixture +def mock_temp_folder(mocker): + temp_folder = tempfile.mkdtemp() + mocker.patch.object(tempfile, "mkdtemp", return_value=temp_folder) + yield temp_folder diff --git a/lambdas/tests/unit/enums/test_supported_document_types.py b/lambdas/tests/unit/enums/test_supported_document_types.py index bd69cc1d1..c2685abc5 100644 --- a/lambdas/tests/unit/enums/test_supported_document_types.py +++ b/lambdas/tests/unit/enums/test_supported_document_types.py @@ -1,6 +1,11 @@ import pytest from enums.supported_document_types import SupportedDocumentTypes -from tests.unit.conftest import MOCK_ARF_TABLE_NAME, MOCK_LG_TABLE_NAME +from tests.unit.conftest import ( + MOCK_ARF_BUCKET, + MOCK_ARF_TABLE_NAME, + MOCK_LG_BUCKET, + MOCK_LG_TABLE_NAME, +) @pytest.mark.parametrize( @@ -15,3 +20,17 @@ def test_get_dynamodb_table_name_return_table_name(set_env, doc_type, expected): actual = doc_type_enum.get_dynamodb_table_name() assert actual == expected + + +@pytest.mark.parametrize( + ["doc_type", "expected"], + [ + (SupportedDocumentTypes.ARF, MOCK_ARF_BUCKET), + (SupportedDocumentTypes.LG, MOCK_LG_BUCKET), + ], +) +def test_get_s3_bucket_name_return_bucket_name(set_env, doc_type, expected): + doc_type_enum = SupportedDocumentTypes(doc_type) + actual = doc_type_enum.get_s3_bucket_name() + + assert actual == expected diff --git a/lambdas/tests/unit/handlers/test_data_collection_handler.py b/lambdas/tests/unit/handlers/test_data_collection_handler.py new file mode 100644 index 000000000..fb6e5aa6f --- /dev/null +++ b/lambdas/tests/unit/handlers/test_data_collection_handler.py @@ -0,0 +1,22 @@ +import pytest +from handlers.data_collection_handler import lambda_handler +from services.data_collection_service import DataCollectionService +from utils.exceptions import MissingEnvVarException + + +def test_lambda_handler_call_underlying_service(mocker, context, set_env): + mock_data_collection_service = mocker.patch( + "handlers.data_collection_handler.DataCollectionService", + spec=DataCollectionService, + ).return_value + + lambda_handler(None, context) + + mock_data_collection_service.collect_all_data_and_write_to_dynamodb.assert_called_once() + + +def test_lambda_handler_check_for_env_vars(context): + with pytest.raises(MissingEnvVarException) as error: + lambda_handler(None, context) + + assert "An error occurred due to missing environment variable" in str(error.value) diff --git a/lambdas/tests/unit/handlers/test_statistical_report_handler.py b/lambdas/tests/unit/handlers/test_statistical_report_handler.py new file mode 100644 index 000000000..fa145e7fb --- /dev/null +++ b/lambdas/tests/unit/handlers/test_statistical_report_handler.py @@ -0,0 +1,22 @@ +import pytest +from handlers.statistical_report_handler import lambda_handler +from services.statistical_report_service import StatisticalReportService +from utils.exceptions import MissingEnvVarException + + +def test_lambda_handler_call_underlying_service(mocker, context, set_env): + mock_statistical_report_service = mocker.patch( + "handlers.statistical_report_handler.StatisticalReportService", + spec=StatisticalReportService, + ).return_value + + lambda_handler(None, context) + + mock_statistical_report_service.make_weekly_summary_and_output_to_bucket.assert_called_once() + + +def test_lambda_handler_check_for_env_vars(context): + with pytest.raises(MissingEnvVarException) as error: + lambda_handler(None, context) + + assert "An error occurred due to missing environment variable" in str(error.value) diff --git a/lambdas/tests/unit/helpers/data/dynamo_scan_response.py b/lambdas/tests/unit/helpers/data/dynamo_scan_response.py index e39976e4a..962ba2484 100644 --- a/lambdas/tests/unit/helpers/data/dynamo_scan_response.py +++ b/lambdas/tests/unit/helpers/data/dynamo_scan_response.py @@ -1,5 +1,7 @@ from copy import deepcopy +from utils.utilities import flatten + MOCK_RESPONSE = { "Items": [ { @@ -106,3 +108,78 @@ "RetryAttempts": 0, }, } + +MOCK_PAGINATED_RESPONSE_1 = { + "Items": [ + { + "FileName": "1of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + { + "FileName": "3of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + { + "FileName": "5of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + { + "FileName": "7of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + ], + "Count": 4, + "ScannedCount": 4, + "LastEvaluatedKey": {"ID": "id_token_for_page_2"}, +} + + +MOCK_PAGINATED_RESPONSE_2 = { + "Items": [ + { + "FileName": "2of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + { + "FileName": "10of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + { + "FileName": "8of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + { + "FileName": "6of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + ], + "Count": 4, + "ScannedCount": 4, + "LastEvaluatedKey": {"ID": "id_token_for_page_3"}, +} + +MOCK_PAGINATED_RESPONSE_3 = { + "Items": [ + { + "FileName": "9of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + { + "FileName": "4of10_Lloyd_George_Record_[Jane Smith]_[9000000009]_[22-10-2010].pdf", + "VirusScannerResult": "Clean", + }, + ], + "Count": 2, + "ScannedCount": 2, +} + +EXPECTED_ITEMS_FOR_PAGINATED_RESULTS = flatten( + [ + response["Items"] + for response in [ + MOCK_PAGINATED_RESPONSE_1, + MOCK_PAGINATED_RESPONSE_2, + MOCK_PAGINATED_RESPONSE_3, + ] + ] +) diff --git a/lambdas/tests/unit/helpers/data/s3_responses.py b/lambdas/tests/unit/helpers/data/s3_responses.py index 5a385de84..fb8150626 100644 --- a/lambdas/tests/unit/helpers/data/s3_responses.py +++ b/lambdas/tests/unit/helpers/data/s3_responses.py @@ -10,3 +10,75 @@ "x-amz-signature": "test-signature", }, } + +MOCK_LIST_OBJECTS_RESPONSE = { + "ResponseMetadata": { + "RequestId": "abc", + "HostId": "xyz", + "HTTPStatusCode": 200, + "HTTPHeaders": { + "x-amz-id-2": "efg", + "x-amz-request-id": "HIJ", + "date": "Wed, 04 Jun 2024 13:49:45 GMT", + "x-amz-bucket-region": "eu-west-2", + "content-type": "application/xml", + "transfer-encoding": "chunked", + "server": "AmazonS3", + }, + "RetryAttempts": 0, + }, + "IsTruncated": False, + "Contents": [ + { + "Key": "9000000009/2985a5dd-37ac-481a-b847-ee09e4b0817b", + "ETag": '"ddeafe0237ac7cb097c9a34c0e21a8a9"', + "Size": 928, + "StorageClass": "STANDARD", + }, + { + "Key": "9000000009/384b886d-bd86-4211-9f43-73f7146fbb9b", + "ETag": '"ddeafe0237ac7cb097c9a34c0e21a8a9"', + "Size": 928, + "StorageClass": "STANDARD", + }, + ], + "Name": "test-lg-bucket", + "Prefix": "", + "MaxKeys": 1000, + "EncodingType": "url", + "KeyCount": 2, +} + + +MOCK_LIST_OBJECTS_PAGINATED_RESPONSES = [ + { + "IsTruncated": True, + "Contents": [ + { + "Key": "9000000009/2985a5dd-37ac-481a-b847-ee09e4b0817b", + "Size": 928, + "StorageClass": "STANDARD", + }, + { + "Key": "9000000009/384b886d-bd86-4211-9f43-73f7146fbb9b", + "Size": 928, + "StorageClass": "STANDARD", + }, + ], + }, + { + "IsTruncated": True, + "Contents": [ + { + "Key": "9000000009/94c93a0c-a322-4eaa-ad0b-29ea876c33a5", + "Size": 928, + "StorageClass": "STANDARD", + }, + { + "Key": "9000000009/36af7807-7965-4c17-b2eb-0f2ae903196d", + "Size": 928, + "StorageClass": "STANDARD", + }, + ], + }, +] diff --git a/lambdas/tests/unit/helpers/data/statistic/__init__.py b/lambdas/tests/unit/helpers/data/statistic/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_collected_data.py b/lambdas/tests/unit/helpers/data/statistic/mock_collected_data.py new file mode 100644 index 000000000..0553358bf --- /dev/null +++ b/lambdas/tests/unit/helpers/data/statistic/mock_collected_data.py @@ -0,0 +1,80 @@ +from datetime import datetime + +from models.statistics import ApplicationData, OrganisationData, RecordStoreData +from tests.unit.helpers.data.statistic.mock_dynamodb_and_s3_records import ( + TOTAL_FILE_SIZE_FOR_H81109, + TOTAL_FILE_SIZE_FOR_Y12345, +) +from tests.unit.helpers.data.statistic.mock_logs_query_results import ( + HASHED_USER_ID_1, + HASHED_USER_ID_2, +) + +TODAY_DATE = datetime.today().strftime("%Y%m%d") + +MOCK_RECORD_STORE_DATA = [ + RecordStoreData( + statistic_id="mock_uuid", + date=TODAY_DATE, + ods_code="H81109", + total_number_of_records=6, + number_of_document_types=2, + total_size_of_records_in_megabytes=TOTAL_FILE_SIZE_FOR_H81109, + average_size_of_documents_per_patient_in_megabytes=TOTAL_FILE_SIZE_FOR_H81109 + / 2, + ), + RecordStoreData( + statistic_id="mock_uuid", + date=TODAY_DATE, + ods_code="Y12345", + total_number_of_records=2, + number_of_document_types=2, + total_size_of_records_in_megabytes=TOTAL_FILE_SIZE_FOR_Y12345, + average_size_of_documents_per_patient_in_megabytes=TOTAL_FILE_SIZE_FOR_Y12345, + ), +] + +MOCK_ORGANISATION_DATA = [ + OrganisationData( + statistic_id="mock_uuid", + date=TODAY_DATE, + ods_code="H81109", + number_of_patients=2, + average_records_per_patient=3, + daily_count_stored=4, + daily_count_viewed=40, + daily_count_downloaded=20, + daily_count_deleted=2, + ), + OrganisationData( + statistic_id="mock_uuid", + date=TODAY_DATE, + ods_code="Y12345", + number_of_patients=1, + average_records_per_patient=2, + daily_count_stored=2, + daily_count_viewed=20, + daily_count_downloaded=10, + daily_count_deleted=1, + ), +] + +MOCK_APPLICATION_DATA = [ + ApplicationData( + statistic_id="mock_uuid", + date=TODAY_DATE, + ods_code="H81109", + active_user_ids_hashed=[HASHED_USER_ID_1, HASHED_USER_ID_2], + ), + ApplicationData( + statistic_id="mock_uuid", + date=TODAY_DATE, + ods_code="Y12345", + active_user_ids_hashed=[HASHED_USER_ID_1], + ), +] + +ALL_MOCK_DATA = MOCK_RECORD_STORE_DATA + MOCK_ORGANISATION_DATA + MOCK_APPLICATION_DATA +ALL_MOCK_DATA_AS_JSON_LIST = list( + map(lambda data: data.model_dump(by_alias=True), ALL_MOCK_DATA) +) diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_data_build_utils.py b/lambdas/tests/unit/helpers/data/statistic/mock_data_build_utils.py new file mode 100644 index 000000000..cd5f38ded --- /dev/null +++ b/lambdas/tests/unit/helpers/data/statistic/mock_data_build_utils.py @@ -0,0 +1,57 @@ +import random + +from models.statistics import ApplicationData, OrganisationData, RecordStoreData + + +def make_random_data( + ods_code: str, + date_range: list[str], + field_names: set[str], +) -> list[dict]: + results = [] + for date in date_range: + random_data: dict[str, str | int] = { + key: random.randint(0, 1000) for key in field_names + } + random_data.update({"date": date, "ods_code": ods_code}) + results.append(random_data) + return results + + +def build_random_record_store_data( + ods_code: str, date_range: list[str] +) -> list[RecordStoreData]: + field_names = set(RecordStoreData.model_fields.keys()) - { + "ods_code", + "date", + "statistic_id", + } + all_random_data = make_random_data(ods_code, date_range, field_names) + return [RecordStoreData(**data) for data in all_random_data] + + +def build_random_organisation_data( + ods_code: str, date_range: list[str] +) -> list[OrganisationData]: + field_names = set(OrganisationData.model_fields.keys()) - { + "ods_code", + "date", + "statistic_id", + } + all_random_data = make_random_data(ods_code, date_range, field_names) + return [OrganisationData(**data) for data in all_random_data] + + +def build_random_application_data( + ods_code: str, date_range: list[str] +) -> list[ApplicationData]: + result = [] + for date in date_range: + random_numbers = random.sample(range(30), k=random.randint(1, 10)) + hashed_user_ids = [f"userid_{number}" for number in random_numbers] + application_data = ApplicationData( + ods_code=ods_code, date=date, active_user_ids_hashed=hashed_user_ids + ) + result.append(application_data) + + return result diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_dynamodb_and_s3_records.py b/lambdas/tests/unit/helpers/data/statistic/mock_dynamodb_and_s3_records.py new file mode 100644 index 000000000..e1f22d551 --- /dev/null +++ b/lambdas/tests/unit/helpers/data/statistic/mock_dynamodb_and_s3_records.py @@ -0,0 +1,127 @@ +import math +import random +import uuid +from typing import Tuple + +PDF_MIME_TYPE = "application/pdf" + + +def build_mock_results( + ods_code: str, + nhs_number: str, + number_of_files: int = 3, + total_file_size_in_mb: int = 5, +) -> Tuple[list[dict], list[dict]]: + file_ids = [uuid.uuid4() for _ in range(number_of_files)] + file_size_randoms = [random.randint(1, 10) for _ in range(number_of_files)] + total_file_size = total_file_size_in_mb * 1024 * 1024 + file_sizes = [ + math.floor(x * total_file_size / sum(file_size_randoms)) + for x in file_size_randoms + ] + file_sizes[-1] = total_file_size - sum(file_sizes[:-1]) + + dynamodb_scan_result = [ + { + "CurrentGpOds": ods_code, + "ContentType": PDF_MIME_TYPE, + "FileLocation": f"s3://test-lg-table/{nhs_number}/{file_id}", + "NhsNumber": nhs_number, + } + for file_id in file_ids + ] + + s3_list_objects_result = [ + { + "Key": f"{nhs_number}/{file_ids[index]}", + "Size": file_sizes[index], + } + for index in range(number_of_files) + ] + + return dynamodb_scan_result, s3_list_objects_result + + +MOCK_LG_SCAN_RESULT = [ + { + "CurrentGpOds": "H81109", + "ContentType": PDF_MIME_TYPE, + "FileLocation": "s3://test-lg-table/9000000009/4ac21f7b-abd6-46c9-bf55-e7bafccba2ab", + "NhsNumber": "9000000009", + }, + { + "CurrentGpOds": "H81109", + "ContentType": PDF_MIME_TYPE, + "FileLocation": "s3://test-lg-table/9000000009/95764e53-b5fd-47b8-8756-1c5604121444", + "NhsNumber": "9000000009", + }, + { + "CurrentGpOds": "H81109", + "ContentType": PDF_MIME_TYPE, + "FileLocation": "s3://test-lg-table/9000000009/0ab18243-783a-4044-8146-b5b0996d8422", + "NhsNumber": "9000000009", + }, + { + "CurrentGpOds": "H81109", + "ContentType": PDF_MIME_TYPE, + "FileLocation": "s3://test-lg-table/9000000001/5d5b3c28-e6c8-4d46-8ae3-a2b97321bcf8", + "NhsNumber": "9000000001", + }, + { + "CurrentGpOds": "H81109", + "ContentType": PDF_MIME_TYPE, + "FileLocation": "s3://test-lg-table/9000000001/2543ba87-dcdb-4583-bae2-e83c4ba7af34", + "NhsNumber": "9000000001", + }, +] + +MOCK_ARF_SCAN_RESULT = [ + { + "CurrentGpOds": "Y12345", + "ContentType": "application/msword", + "FileLocation": "s3://test-arf-table/9000000005/beec3523-1428-4c5f-b718-6ef25a4db1b9", + "NhsNumber": "9000000005", + }, + { + "CurrentGpOds": "Y12345", + "ContentType": "image/jpeg", + "FileLocation": "s3://test-arf-table/9000000005/d2cf885b-9e78-4c29-bf5a-2a56e5e0df8f", + "NhsNumber": "9000000005", + }, + { + "CurrentGpOds": "H81109", + "ContentType": "image/bmp", + "FileLocation": "s3://test-arf-table/9000000009/71e0c54e-5cfc-4260-a538-09eab185a6ed", + "NhsNumber": "9000000009", + }, +] + +MB = 1024 * 1024 +MOCK_LG_LIST_OBJECTS_RESULT = [ + {"Key": "9000000009/4ac21f7b-abd6-46c9-bf55-e7bafccba2ab", "Size": 1 * MB}, + {"Key": "9000000009/95764e53-b5fd-47b8-8756-1c5604121444", "Size": 2 * MB}, + {"Key": "9000000009/0ab18243-783a-4044-8146-b5b0996d8422", "Size": 3 * MB}, + {"Key": "9000000001/5d5b3c28-e6c8-4d46-8ae3-a2b97321bcf8", "Size": 4 * MB}, + {"Key": "9000000001/2543ba87-dcdb-4583-bae2-e83c4ba7af34", "Size": 5 * MB}, +] +MOCK_ARF_LIST_OBJECTS_RESULT = [ + { + "Key": "9000000005/beec3523-1428-4c5f-b718-6ef25a4db1b9", + "Size": 1 * MB, + }, + { + "Key": "9000000005/d2cf885b-9e78-4c29-bf5a-2a56e5e0df8f", + "Size": 2 * MB, + }, + { + "Key": "9000000009/71e0c54e-5cfc-4260-a538-09eab185a6ed", + "Size": 6 * MB, + }, +] +TOTAL_FILE_SIZE_FOR_9000000001 = 4 + 5 +TOTAL_FILE_SIZE_FOR_9000000009 = 1 + 2 + 3 + 6 +TOTAL_FILE_SIZE_FOR_9000000005 = 1 + 2 +TOTAL_FILE_SIZE_FOR_H81109 = ( + TOTAL_FILE_SIZE_FOR_9000000001 + TOTAL_FILE_SIZE_FOR_9000000009 +) +TOTAL_FILE_SIZE_FOR_Y12345 = TOTAL_FILE_SIZE_FOR_9000000005 diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_logs_query_results.py b/lambdas/tests/unit/helpers/data/statistic/mock_logs_query_results.py new file mode 100644 index 000000000..8e5049b52 --- /dev/null +++ b/lambdas/tests/unit/helpers/data/statistic/mock_logs_query_results.py @@ -0,0 +1,98 @@ +USER_ID_1 = "F4A6AF98-4800-4A8A-A6C0-8FE0AC4B994B" +USER_ID_2 = "9E7F1235-3DF1-4822-AFFB-C4FCC88C2690" +HASHED_USER_ID_1 = "3192b6cf7ef953cf1a1f0945a83b55ab2cb8bae95cac6548ae5412aaa4c67677" +HASHED_USER_ID_2 = "a89d1cb4ac0776e45131c65a69e8b1a48026e9b497c94409e480588418a016e4" + +MOCK_UNIQUE_ACTIVE_USER_IDS = [ + { + "ods_code": "Y12345", + "user_id": USER_ID_1, + }, + { + "ods_code": "H81109", + "user_id": USER_ID_1, + }, + { + "ods_code": "H81109", + "user_id": USER_ID_2, + }, +] + + +MOCK_LG_VIEWED = [ + { + "ods_code": "Y12345", + "daily_count_viewed": "20", + }, + { + "ods_code": "H81109", + "daily_count_viewed": "40", + }, +] + +MOCK_LG_DOWNLOADED = [ + { + "ods_code": "Y12345", + "daily_count_downloaded": "10", + }, + { + "ods_code": "H81109", + "daily_count_downloaded": "20", + }, +] + +MOCK_LG_DELETED = [ + { + "ods_code": "Y12345", + "daily_count_deleted": "1", + }, + { + "ods_code": "H81109", + "daily_count_deleted": "2", + }, +] + +MOCK_LG_STORED = [ + { + "ods_code": "Y12345", + "daily_count_stored": "2", + }, + { + "ods_code": "H81109", + "daily_count_stored": "4", + }, +] + +MOCK_RESPONSE_QUERY_IN_PROGRESS = {"status": "Running"} + +MOCK_RESPONSE_QUERY_FAILED = {"status": "Failed"} + +MOCK_RESPONSE_QUERY_COMPLETE = { + "results": [ + [ + {"field": "ods_code", "value": "Y12345"}, + {"field": "daily_count_viewed", "value": "20"}, + ], + [ + {"field": "ods_code", "value": "H81109"}, + {"field": "daily_count_viewed", "value": "40"}, + ], + ], + "statistics": { + "recordsMatched": 123.0, + "recordsScanned": 123.0, + "bytesScanned": 123.0, + }, + "status": "Complete", +} + +EXPECTED_QUERY_RESULT = [ + { + "ods_code": "Y12345", + "daily_count_viewed": "20", + }, + { + "ods_code": "H81109", + "daily_count_viewed": "40", + }, +] diff --git a/lambdas/tests/unit/helpers/data/statistic/mock_statistic_data.py b/lambdas/tests/unit/helpers/data/statistic/mock_statistic_data.py new file mode 100644 index 000000000..eaa91162a --- /dev/null +++ b/lambdas/tests/unit/helpers/data/statistic/mock_statistic_data.py @@ -0,0 +1,298 @@ +from decimal import Decimal + +import polars as pl +from models.statistics import ApplicationData, OrganisationData, RecordStoreData + +MOCK_RECORD_STORE_DATA_1 = RecordStoreData( + statistic_id="e02ec4db-8a7d-4f84-a4b3-875a526b37d4", + date="20240510", + ods_code="Z56789", + total_number_of_records=18, + number_of_document_types=1, + total_size_of_records_in_megabytes=Decimal("1.75"), + average_size_of_documents_per_patient_in_megabytes=Decimal("1.5"), +) + +MOCK_RECORD_STORE_DATA_2 = RecordStoreData( + statistic_id="974b1ca0-8e5e-4d12-9673-93050f0fee71", + date="20240510", + ods_code="Y12345", + total_number_of_records=25, + number_of_document_types=1, + total_size_of_records_in_megabytes=Decimal("1.23"), + average_size_of_documents_per_patient_in_megabytes=Decimal("0.5"), +) + +MOCK_RECORD_STORE_DATA_3 = RecordStoreData( + statistic_id="c2841ca0-8e5e-4d12-9673-93050f0fee71", + date="20240511", + ods_code="Y12345", + total_number_of_records=20, + number_of_document_types=2, + total_size_of_records_in_megabytes=Decimal("2.34"), + average_size_of_documents_per_patient_in_megabytes=Decimal("0.6"), +) + +EXPECTED_SUMMARY_RECORD_STORE_DATA = pl.DataFrame( + [ + { + "ods_code": "Z56789", + "total_number_of_records": 18, + "number_of_document_types": 1, + "total_size_of_records_in_megabytes": 1.75, + "average_size_of_documents_per_patient_in_megabytes": 1.5, + }, + { + "ods_code": "Y12345", + "total_number_of_records": 20, + "number_of_document_types": 2, + "total_size_of_records_in_megabytes": 2.34, + "average_size_of_documents_per_patient_in_megabytes": 0.6, + }, + ] +) + +SERIALISED_RECORD_STORE_DATA = [ + { + "TotalSizeOfRecordsInMegabytes": Decimal("1.75"), + "AverageSizeOfDocumentsPerPatientInMegabytes": Decimal("1.5"), + "Date": "20240510", + "TotalNumberOfRecords": 18, + "NumberOfDocumentTypes": 1, + "OdsCode": "Z56789", + "StatisticID": "RecordStoreData#e02ec4db-8a7d-4f84-a4b3-875a526b37d4", + }, + { + "TotalSizeOfRecordsInMegabytes": Decimal("1.23"), + "AverageSizeOfDocumentsPerPatientInMegabytes": Decimal("0.5"), + "Date": "20240510", + "TotalNumberOfRecords": 25, + "NumberOfDocumentTypes": 1, + "OdsCode": "Y12345", + "StatisticID": "RecordStoreData#974b1ca0-8e5e-4d12-9673-93050f0fee71", + }, + { + "TotalSizeOfRecordsInMegabytes": Decimal("2.34"), + "AverageSizeOfDocumentsPerPatientInMegabytes": Decimal("0.6"), + "Date": "20240511", + "TotalNumberOfRecords": 20, + "NumberOfDocumentTypes": 2, + "OdsCode": "Y12345", + "StatisticID": "RecordStoreData#c2841ca0-8e5e-4d12-9673-93050f0fee71", + }, +] + +MOCK_ORGANISATION_DATA_1 = OrganisationData( + statistic_id="5acda4bf-8b93-4ba0-8410-789aac4fcbae", + date="20240510", + ods_code="Z56789", + number_of_patients=4, + average_records_per_patient=Decimal("4.5"), + daily_count_stored=0, + daily_count_viewed=35, + daily_count_downloaded=4, + daily_count_deleted=1, +) +MOCK_ORGANISATION_DATA_2 = OrganisationData( + statistic_id="9ee2c3d1-97b9-4c34-b75c-83e7d1b442f4", + date="20240510", + ods_code="Y12345", + number_of_patients=9, + average_records_per_patient=Decimal("2.78"), + daily_count_stored=0, + daily_count_viewed=15, + daily_count_downloaded=1, + daily_count_deleted=1, +) +MOCK_ORGANISATION_DATA_3 = OrganisationData( + statistic_id="3f54cfe3-6c84-4bb2-b5b4-b786aa03b9c7", + date="20240511", + ods_code="Y12345", + number_of_patients=10, + average_records_per_patient=Decimal("3.51"), + daily_count_stored=2, + daily_count_viewed=30, + daily_count_downloaded=5, + daily_count_deleted=1, +) + +EXPECTED_SUMMARY_ORGANISATION_DATA = pl.DataFrame( + [ + { + "ods_code": "Z56789", + "weekly_count_stored": 0, + "weekly_count_viewed": 35, + "weekly_count_downloaded": 4, + "weekly_count_deleted": 1, + "average_records_per_patient": 4.5, + "number_of_patients": 4, + }, + { + "ods_code": "Y12345", + "weekly_count_stored": 0 + 2, + "weekly_count_viewed": 15 + 30, + "weekly_count_downloaded": 1 + 5, + "weekly_count_deleted": 1 + 1, + "average_records_per_patient": (3.51 + 2.78) / 2, + "number_of_patients": 10, + }, + ] +) + + +SERIALISED_ORGANISATION_DATA = [ + { + "Date": "20240510", + "OdsCode": "Z56789", + "NumberOfPatients": 4, + "AverageRecordsPerPatient": Decimal("4.5"), + "DailyCountStored": 0, + "DailyCountViewed": 35, + "DailyCountDownloaded": 4, + "DailyCountDeleted": 1, + "StatisticID": "OrganisationData#5acda4bf-8b93-4ba0-8410-789aac4fcbae", + }, + { + "Date": "20240510", + "OdsCode": "Y12345", + "NumberOfPatients": 9, + "AverageRecordsPerPatient": Decimal("2.78"), + "DailyCountStored": 0, + "DailyCountViewed": 15, + "DailyCountDownloaded": 1, + "DailyCountDeleted": 1, + "StatisticID": "OrganisationData#9ee2c3d1-97b9-4c34-b75c-83e7d1b442f4", + }, + { + "Date": "20240511", + "OdsCode": "Y12345", + "NumberOfPatients": 10, + "AverageRecordsPerPatient": Decimal("3.51"), + "DailyCountStored": 2, + "DailyCountViewed": 30, + "DailyCountDownloaded": 5, + "DailyCountDeleted": 1, + "StatisticID": "OrganisationData#3f54cfe3-6c84-4bb2-b5b4-b786aa03b9c7", + }, +] + +MOCK_APPLICATION_DATA_1 = ApplicationData( + statistic_id="65ee0add-41ca-4b71-a6d2-63e309bed920", + date="20240510", + ods_code="Z56789", + active_user_ids_hashed=[ + "zf1af742e351ce63d8ed275d4bec8d8f", + ], +) +MOCK_APPLICATION_DATA_2 = ApplicationData( + statistic_id="12d92f26-47c3-452c-923b-819cfcc27c79", + date="20240510", + ods_code="Y12345", + active_user_ids_hashed=[ + "a873620d0b476b13ee571a28cc315870", + "ba81803adac3c816b6cbaf67bf33022a", + ], +) +MOCK_APPLICATION_DATA_3 = ApplicationData( + statistic_id="d495959f-93dc-4f05-a869-43d8711ca120", + date="20240511", + ods_code="Y12345", + active_user_ids_hashed=[ + "a873620d0b476b13ee571a28cc315870", + "cf1af742e351ce63d8ed275d4bec8d8f", + ], +) + +EXPECTED_SUMMARY_APPLICATION_DATA = pl.DataFrame( + [ + {"ods_code": "Z56789", "active_users_count": 1}, + {"ods_code": "Y12345", "active_users_count": 3}, + ], +) + +SERIALISED_APPLICATION_DATA = [ + { + "Date": "20240510", + "OdsCode": "Z56789", + "StatisticID": "ApplicationData#65ee0add-41ca-4b71-a6d2-63e309bed920", + "ActiveUserIdsHashed": [ + "zf1af742e351ce63d8ed275d4bec8d8f", + ], + }, + { + "Date": "20240510", + "OdsCode": "Y12345", + "StatisticID": "ApplicationData#12d92f26-47c3-452c-923b-819cfcc27c79", + "ActiveUserIdsHashed": [ + "a873620d0b476b13ee571a28cc315870", + "ba81803adac3c816b6cbaf67bf33022a", + ], + }, + { + "Date": "20240511", + "OdsCode": "Y12345", + "StatisticID": "ApplicationData#d495959f-93dc-4f05-a869-43d8711ca120", + "ActiveUserIdsHashed": [ + "a873620d0b476b13ee571a28cc315870", + "cf1af742e351ce63d8ed275d4bec8d8f", + ], + }, +] + +ALL_MOCKED_STATISTIC_DATA = ( + [MOCK_RECORD_STORE_DATA_1, MOCK_RECORD_STORE_DATA_2, MOCK_RECORD_STORE_DATA_3], + [MOCK_ORGANISATION_DATA_1, MOCK_ORGANISATION_DATA_2, MOCK_ORGANISATION_DATA_3], + [MOCK_APPLICATION_DATA_1, MOCK_APPLICATION_DATA_2, MOCK_APPLICATION_DATA_3], +) + +ALL_SUMMARY_DATA = [ + EXPECTED_SUMMARY_RECORD_STORE_DATA, + EXPECTED_SUMMARY_ORGANISATION_DATA, + EXPECTED_SUMMARY_APPLICATION_DATA, +] + +MOCK_DYNAMODB_ITEMS = ( + SERIALISED_APPLICATION_DATA + + SERIALISED_ORGANISATION_DATA + + SERIALISED_RECORD_STORE_DATA +) + +MOCK_DYNAMODB_QUERY_RESPONSE = [ + {"Items": [item for item in MOCK_DYNAMODB_ITEMS if item["Date"] == "20240510"]}, + {"Items": [item for item in MOCK_DYNAMODB_ITEMS if item["Date"] == "20240511"]}, +] + +EXPECTED_WEEKLY_SUMMARY = pl.DataFrame( + [ + { + "Date": "20240505-20240511", + "ODS code": "Z56789", + "Active users count": 1, + "Average records per patient": 4.5, + "Average size of documents per patient in megabytes": 1.5, + "Number of document types": 1, + "Number of patients": 4, + "Total number of records": 18, + "Total size of records in megabytes": 1.75, + "Weekly count deleted": 1, + "Weekly count downloaded": 4, + "Weekly count stored": 0, + "Weekly count viewed": 35, + }, + { + "Date": "20240505-20240511", + "ODS code": "Y12345", + "Active users count": 3, + "Average records per patient": (2.78 + 3.51) / 2, + "Average size of documents per patient in megabytes": 0.6, + "Number of document types": 2, + "Number of patients": 10, + "Total number of records": 20, + "Total size of records in megabytes": 2.34, + "Weekly count deleted": 1 + 1, + "Weekly count downloaded": 1 + 5, + "Weekly count stored": 0 + 2, + "Weekly count viewed": 15 + 30, + }, + ] +) diff --git a/lambdas/tests/unit/models/test_statistics_models.py b/lambdas/tests/unit/models/test_statistics_models.py new file mode 100644 index 000000000..b6a744448 --- /dev/null +++ b/lambdas/tests/unit/models/test_statistics_models.py @@ -0,0 +1,63 @@ +import pydantic +import pytest +from models.statistics import ApplicationData, RecordStoreData, load_from_dynamodb_items +from tests.unit.helpers.data.statistic.mock_statistic_data import ( + MOCK_APPLICATION_DATA_1, + MOCK_APPLICATION_DATA_2, + MOCK_APPLICATION_DATA_3, + MOCK_DYNAMODB_ITEMS, + MOCK_ORGANISATION_DATA_1, + MOCK_ORGANISATION_DATA_2, + MOCK_ORGANISATION_DATA_3, + MOCK_RECORD_STORE_DATA_1, + MOCK_RECORD_STORE_DATA_2, + MOCK_RECORD_STORE_DATA_3, + SERIALISED_RECORD_STORE_DATA, +) + + +def test_serialise_and_deserialise_record_store_data(mocker): + mocker.patch("uuid.uuid4", return_value="test_uuid") + + test_data = MOCK_RECORD_STORE_DATA_1 + + output = test_data.model_dump(by_alias=True) + expected = SERIALISED_RECORD_STORE_DATA[0] + assert output == expected + + load_from_deserialised = RecordStoreData.model_validate(output) + assert test_data == load_from_deserialised + + +def test_empty_ods_code_will_be_filled_with_an_empty_value(): + data = RecordStoreData(date="20240516", ods_code="") + assert data.ods_code == "NO_ODS_CODE" + + +def test_validation_error_raised_when_try_to_deserialise_to_wrong_type(): + test_data = SERIALISED_RECORD_STORE_DATA[0] + + with pytest.raises(pydantic.ValidationError) as e: + ApplicationData.model_validate(test_data) + + assert "StatisticID must be in the form of `ApplicationData#uuid" in str(e.value) + + +def test_load_from_dynamodb_items(): + deserialised_data = load_from_dynamodb_items(MOCK_DYNAMODB_ITEMS) + + assert deserialised_data.record_store_data == [ + MOCK_RECORD_STORE_DATA_1, + MOCK_RECORD_STORE_DATA_2, + MOCK_RECORD_STORE_DATA_3, + ] + assert deserialised_data.organisation_data == [ + MOCK_ORGANISATION_DATA_1, + MOCK_ORGANISATION_DATA_2, + MOCK_ORGANISATION_DATA_3, + ] + assert deserialised_data.application_data == [ + MOCK_APPLICATION_DATA_1, + MOCK_APPLICATION_DATA_2, + MOCK_APPLICATION_DATA_3, + ] diff --git a/lambdas/tests/unit/services/base/test_cloudwatch_logs_query_service.py b/lambdas/tests/unit/services/base/test_cloudwatch_logs_query_service.py new file mode 100644 index 000000000..ceed28490 --- /dev/null +++ b/lambdas/tests/unit/services/base/test_cloudwatch_logs_query_service.py @@ -0,0 +1,133 @@ +import pytest +from services.base.cloudwatch_service import CloudwatchService +from tests.unit.conftest import WORKSPACE +from tests.unit.helpers.data.statistic.mock_logs_query_results import ( + EXPECTED_QUERY_RESULT, + MOCK_RESPONSE_QUERY_COMPLETE, + MOCK_RESPONSE_QUERY_FAILED, + MOCK_RESPONSE_QUERY_IN_PROGRESS, +) +from utils.cloudwatch_logs_query import CloudwatchLogsQueryParams +from utils.exceptions import LogsQueryException + +MOCK_QUERY_ID = "mock_query_id" +MOCK_LAMBDA_NAME = "mock-lambda" +MOCK_QUERY_STRING = """ + fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code + | filter Message = 'User has viewed Lloyd George records' + | stats count() AS daily_count_viewed BY ods_code + """ +MOCK_QUERY_PARAMS = CloudwatchLogsQueryParams(MOCK_LAMBDA_NAME, MOCK_QUERY_STRING) +MOCK_START_TIME = 1717667304 +MOCK_END_TIME = 171777304 + + +@pytest.fixture +def mock_service(set_env, mock_logs_client, patch_sleep): + service = CloudwatchService() + yield service + + +@pytest.fixture +def patch_sleep(mocker): + mocker.patch("time.sleep") + + +@pytest.fixture +def mock_logs_client(mocker): + mock_instance = mocker.patch("boto3.client").return_value + yield mock_instance + + +def test_query_logs(mock_logs_client, mock_service): + mock_logs_client.start_query.return_value = {"queryId": MOCK_QUERY_ID} + mock_logs_client.get_query_results.side_effect = [ + MOCK_RESPONSE_QUERY_IN_PROGRESS, + MOCK_RESPONSE_QUERY_IN_PROGRESS, + MOCK_RESPONSE_QUERY_COMPLETE, + ] + expected = EXPECTED_QUERY_RESULT + expected_start_query_call = { + "logGroupName": f"/aws/lambda/{WORKSPACE}_{MOCK_QUERY_PARAMS.lambda_name}", + "startTime": MOCK_START_TIME, + "endTime": MOCK_END_TIME, + "queryString": MOCK_QUERY_PARAMS.query_string, + } + + actual = mock_service.query_logs( + query_params=MOCK_QUERY_PARAMS, + start_time=MOCK_START_TIME, + end_time=MOCK_END_TIME, + ) + + assert actual == expected + + mock_logs_client.start_query.assert_called_with(**expected_start_query_call) + mock_logs_client.get_query_results.assert_called_with(queryId=MOCK_QUERY_ID) + + +def test_poll_query_result_poll_result_until_complete(mock_logs_client, mock_service): + mock_logs_client.get_query_results.side_effect = [ + MOCK_RESPONSE_QUERY_IN_PROGRESS, + MOCK_RESPONSE_QUERY_IN_PROGRESS, + MOCK_RESPONSE_QUERY_COMPLETE, + ] + + actual = mock_service.poll_query_result(MOCK_QUERY_ID) + expected = MOCK_RESPONSE_QUERY_COMPLETE["results"] + + assert actual == expected + + mock_logs_client.get_query_results.assert_called_with(queryId=MOCK_QUERY_ID) + assert mock_logs_client.get_query_results.call_count == 3 + + +def test_poll_query_result_raise_error_when_exceed_max_retries( + mock_logs_client, mock_service +): + mock_logs_client.get_query_results.return_value = MOCK_RESPONSE_QUERY_IN_PROGRESS + + with pytest.raises(LogsQueryException): + mock_service.poll_query_result(query_id=MOCK_QUERY_ID, max_retries=20) + assert mock_logs_client.get_query_results.call_count == 20 + + +def test_poll_query_result_raise_error_when_query_failed( + mock_logs_client, mock_service +): + mock_logs_client.get_query_results.side_effect = [ + MOCK_RESPONSE_QUERY_IN_PROGRESS, + MOCK_RESPONSE_QUERY_IN_PROGRESS, + MOCK_RESPONSE_QUERY_FAILED, + ] + + with pytest.raises(LogsQueryException): + mock_service.poll_query_result(MOCK_QUERY_ID) + assert mock_logs_client.get_query_results.call_count == 3 + + +def test_regroup_raw_query_result(mock_service): + raw_query_result = [ + [ + {"field": "ods_code", "value": "Y12345"}, + {"field": "daily_count_viewed", "value": "20"}, + ], + [ + {"field": "ods_code", "value": "H81109"}, + {"field": "daily_count_viewed", "value": "40"}, + ], + ] + expected = [ + { + "ods_code": "Y12345", + "daily_count_viewed": "20", + }, + { + "ods_code": "H81109", + "daily_count_viewed": "40", + }, + ] + + actual = mock_service.regroup_raw_query_result(raw_query_result) + + assert actual == expected diff --git a/lambdas/tests/unit/services/base/test_dynamo_service.py b/lambdas/tests/unit/services/base/test_dynamo_service.py index 1cca255cc..3ac940e64 100755 --- a/lambdas/tests/unit/services/base/test_dynamo_service.py +++ b/lambdas/tests/unit/services/base/test_dynamo_service.py @@ -1,3 +1,6 @@ +from typing import Optional +from unittest.mock import call + import pytest from boto3.dynamodb.conditions import Attr, Key from botocore.exceptions import ClientError @@ -6,6 +9,13 @@ from services.base.dynamo_service import DynamoDBService from tests.unit.conftest import MOCK_TABLE_NAME, TEST_NHS_NUMBER from tests.unit.helpers.data.dynamo_responses import MOCK_SEARCH_RESPONSE +from tests.unit.helpers.data.dynamo_scan_response import ( + EXPECTED_ITEMS_FOR_PAGINATED_RESULTS, + MOCK_PAGINATED_RESPONSE_1, + MOCK_PAGINATED_RESPONSE_2, + MOCK_PAGINATED_RESPONSE_3, + MOCK_RESPONSE, +) from utils.dynamo_query_filter_builder import DynamoQueryFilterBuilder from utils.exceptions import DynamoServiceException @@ -32,6 +42,13 @@ def mock_table(mocker, mock_service): yield mocker.patch.object(mock_service, "get_table") +@pytest.fixture +def mock_scan_method(mock_table): + table_instance = mock_table.return_value + scan_method = table_instance.scan + yield scan_method + + @pytest.fixture def mock_filter_expression(): filter_builder = DynamoQueryFilterBuilder() @@ -163,13 +180,13 @@ def test_query_with_requested_fields_client_error_raises_exception( assert expected_response == actual_response.value -def test_simple_query_is_called_with_correct_parameters(mock_service, mock_table): +def test_query_all_fields_is_called_with_correct_parameters(mock_service, mock_table): mock_table.return_value.query.return_value = { "Items": [{"id": "fake_test_item"}], "Counts": 1, } - mock_service.simple_query(MOCK_TABLE_NAME, "test_key_condition_expression") + mock_service.query_all_fields(MOCK_TABLE_NAME, "test_key_condition_expression") mock_table.assert_called_with(MOCK_TABLE_NAME) mock_table.return_value.query.assert_called_once_with( @@ -177,11 +194,13 @@ def test_simple_query_is_called_with_correct_parameters(mock_service, mock_table ) -def test_simple_query_raises_exception_when_results_are_empty(mock_service, mock_table): +def test_query_all_fields_raises_exception_when_results_are_empty( + mock_service, mock_table +): mock_table.return_value.query.return_value = [] with pytest.raises(DynamoServiceException): - mock_service.simple_query(MOCK_TABLE_NAME, "test_key_condition_expression") + mock_service.query_all_fields(MOCK_TABLE_NAME, "test_key_condition_expression") mock_table.assert_called_with(MOCK_TABLE_NAME) mock_table.return_value.query.assert_called_once_with( @@ -189,12 +208,12 @@ def test_simple_query_raises_exception_when_results_are_empty(mock_service, mock ) -def test_simple_query_client_error_raises_exception(mock_service, mock_table): +def test_query_all_fields_client_error_raises_exception(mock_service, mock_table): expected_response = MOCK_CLIENT_ERROR mock_table.return_value.query.side_effect = MOCK_CLIENT_ERROR with pytest.raises(ClientError) as actual_response: - mock_service.simple_query(MOCK_TABLE_NAME, "test_key_condition_expression") + mock_service.query_all_fields(MOCK_TABLE_NAME, "test_key_condition_expression") assert expected_response == actual_response.value @@ -346,6 +365,85 @@ def test_scan_table_client_error_raises_exception(mock_service, mock_table): assert expected_response == actual_response.value +def test_scan_whole_table_return_items_in_response( + mock_service, mock_scan_method, mock_filter_expression +): + mock_project_expression = "mock_project_expression" + mock_scan_method.return_value = MOCK_RESPONSE + + expected = MOCK_RESPONSE["Items"] + actual = mock_service.scan_whole_table( + table_name=MOCK_TABLE_NAME, + project_expression=mock_project_expression, + filter_expression=mock_filter_expression, + ) + + assert expected == actual + + mock_service.get_table.assert_called_with(MOCK_TABLE_NAME) + mock_scan_method.assert_called_with( + ProjectionExpression=mock_project_expression, + FilterExpression=mock_filter_expression, + ) + + +def test_scan_whole_table_handles_pagination( + mock_service, mock_scan_method, mock_filter_expression +): + def mock_scan_implementation( + ExclusiveStartKey: Optional[dict[str, str]] = None, **_kwargs + ): + if not ExclusiveStartKey: + return MOCK_PAGINATED_RESPONSE_1 + elif ExclusiveStartKey.get("ID") == "id_token_for_page_2": + return MOCK_PAGINATED_RESPONSE_2 + elif ExclusiveStartKey.get("ID") == "id_token_for_page_3": + return MOCK_PAGINATED_RESPONSE_3 + + mock_project_expression = "mock_project_expression" + mock_scan_method.side_effect = mock_scan_implementation + + expected_result = EXPECTED_ITEMS_FOR_PAGINATED_RESULTS + expected_calls = [ + call( + ProjectionExpression=mock_project_expression, + FilterExpression=mock_filter_expression, + ), + call( + ProjectionExpression=mock_project_expression, + FilterExpression=mock_filter_expression, + ExclusiveStartKey={"ID": "id_token_for_page_2"}, + ), + call( + ProjectionExpression=mock_project_expression, + FilterExpression=mock_filter_expression, + ExclusiveStartKey={"ID": "id_token_for_page_3"}, + ), + ] + + actual = mock_service.scan_whole_table( + table_name=MOCK_TABLE_NAME, + project_expression=mock_project_expression, + filter_expression=mock_filter_expression, + ) + + assert expected_result == actual + + mock_service.get_table.assert_called_with(MOCK_TABLE_NAME) + mock_scan_method.assert_has_calls(expected_calls) + + +def test_scan_whole_table_omit_expression_arguments_if_not_given( + mock_service, mock_scan_method +): + mock_service.scan_whole_table( + table_name=MOCK_TABLE_NAME, + ) + + mock_service.get_table.assert_called_with(MOCK_TABLE_NAME) + mock_scan_method.assert_called_with() + + def test_get_table_when_table_exists_then_table_is_returned_successfully( mock_service, mock_dynamo_service ): diff --git a/lambdas/tests/unit/services/base/test_s3_service.py b/lambdas/tests/unit/services/base/test_s3_service.py index 63a342657..9b4d3fb05 100755 --- a/lambdas/tests/unit/services/base/test_s3_service.py +++ b/lambdas/tests/unit/services/base/test_s3_service.py @@ -8,8 +8,13 @@ TEST_NHS_NUMBER, TEST_OBJECT_KEY, ) -from tests.unit.helpers.data.s3_responses import MOCK_PRESIGNED_URL_RESPONSE +from tests.unit.helpers.data.s3_responses import ( + MOCK_LIST_OBJECTS_PAGINATED_RESPONSES, + MOCK_LIST_OBJECTS_RESPONSE, + MOCK_PRESIGNED_URL_RESPONSE, +) from utils.exceptions import TagNotFoundException +from utils.utilities import flatten TEST_DOWNLOAD_PATH = "test_path" MOCK_EVENT_BODY = { @@ -26,6 +31,7 @@ def mock_service(mocker, set_env): mocker.patch("services.base.iam_service.IAMService") service = S3Service(custom_aws_role="mock_arn_custom_role") yield service + S3Service._instance = None @pytest.fixture @@ -40,6 +46,12 @@ def mock_custom_client(mocker, mock_service): yield client +@pytest.fixture +def mock_list_objects_paginate(mock_client): + mock_paginator_method = mock_client.get_paginator.return_value.paginate + return mock_paginator_method + + def test_create_upload_presigned_url(mock_service, mock_custom_client): mock_custom_client.generate_presigned_post.return_value = ( MOCK_PRESIGNED_URL_RESPONSE @@ -248,7 +260,6 @@ def test_file_exist_on_s3_raises_client_error_if_unexpected_response( def test_s3_service_singleton_instance(mocker): mocker.patch("boto3.client") - S3Service._instance = None instance_1 = S3Service() instance_2 = S3Service() @@ -257,8 +268,6 @@ def test_s3_service_singleton_instance(mocker): def test_not_created_presigned_url_without_custom_client(mocker): - S3Service._instance = None - mocker.patch("boto3.client") mock_service = S3Service() @@ -268,8 +277,6 @@ def test_not_created_presigned_url_without_custom_client(mocker): def test_not_created_custom_client_without_client_role(mocker): - S3Service._instance = None - mocker.patch("boto3.client") iam_service = mocker.patch("services.base.iam_service.IAMService") @@ -296,3 +303,45 @@ def test_created_custom_client_when_client_role_is_passed(mocker): iam_service.assert_called() assert mock_service.custom_client == custom_client_mock iam_service_instance.assume_role.assert_called() + + +def test_list_all_objects_return_a_list_of_file_details( + mock_service, mock_client, mock_list_objects_paginate +): + mock_list_objects_paginate.return_value = [MOCK_LIST_OBJECTS_RESPONSE] + expected = MOCK_LIST_OBJECTS_RESPONSE["Contents"] + + actual = mock_service.list_all_objects(MOCK_BUCKET) + + assert actual == expected + + mock_client.get_paginator.assert_called_with("list_objects_v2") + mock_list_objects_paginate.assert_called_with(Bucket=MOCK_BUCKET) + + +def test_list_all_objects_handles_paginated_responses( + mock_service, mock_client, mock_list_objects_paginate +): + mock_list_objects_paginate.return_value = MOCK_LIST_OBJECTS_PAGINATED_RESPONSES + + expected = flatten( + [page["Contents"] for page in MOCK_LIST_OBJECTS_PAGINATED_RESPONSES] + ) + + actual = mock_service.list_all_objects(MOCK_BUCKET) + + assert actual == expected + + +def test_list_all_objects_raises_client_error_if_unexpected_response( + mock_service, mock_client, mock_list_objects_paginate +): + mock_error = ClientError( + {"Error": {"Code": "500", "Message": "Internal Server Error"}}, + "S3:ListObjectsV2", + ) + + mock_list_objects_paginate.side_effect = mock_error + + with pytest.raises(ClientError): + mock_service.list_all_objects(MOCK_BUCKET) diff --git a/lambdas/tests/unit/services/test_authoriser_service.py b/lambdas/tests/unit/services/test_authoriser_service.py index 0f953ce08..0a0bbc864 100644 --- a/lambdas/tests/unit/services/test_authoriser_service.py +++ b/lambdas/tests/unit/services/test_authoriser_service.py @@ -44,7 +44,7 @@ def mock_dynamo_service(mocker): } ], } - dynamo_service = mocker.patch.object(DynamoDBService, "simple_query") + dynamo_service = mocker.patch.object(DynamoDBService, "query_all_fields") dynamo_service.return_value = valid_session_record yield dynamo_service @@ -142,7 +142,7 @@ def test_find_login_session_raises_auth_exception( invalid_session_record = { "Count": 1, } - dynamo_service = mocker.patch.object(DynamoDBService, "simple_query") + dynamo_service = mocker.patch.object(DynamoDBService, "query_all_fields") dynamo_service.return_value = invalid_session_record with pytest.raises(AuthorisationException): diff --git a/lambdas/tests/unit/services/test_data_collection_service.py b/lambdas/tests/unit/services/test_data_collection_service.py new file mode 100644 index 000000000..044d00b1a --- /dev/null +++ b/lambdas/tests/unit/services/test_data_collection_service.py @@ -0,0 +1,434 @@ +from datetime import datetime +from decimal import Decimal +from random import shuffle +from unittest.mock import call + +import pytest +from freezegun import freeze_time +from pytest_unordered import unordered +from services.base.cloudwatch_service import CloudwatchService +from services.base.dynamo_service import DynamoDBService +from services.base.s3_service import S3Service +from services.data_collection_service import DataCollectionService +from tests.unit.conftest import ( + MOCK_ARF_BUCKET, + MOCK_ARF_TABLE_NAME, + MOCK_LG_BUCKET, + MOCK_LG_TABLE_NAME, + MOCK_STATISTICS_TABLE, +) +from tests.unit.helpers.data.statistic.mock_collected_data import ( + ALL_MOCK_DATA_AS_JSON_LIST, + MOCK_APPLICATION_DATA, + MOCK_ORGANISATION_DATA, + MOCK_RECORD_STORE_DATA, +) +from tests.unit.helpers.data.statistic.mock_dynamodb_and_s3_records import ( + MOCK_ARF_LIST_OBJECTS_RESULT, + MOCK_ARF_SCAN_RESULT, + MOCK_LG_LIST_OBJECTS_RESULT, + MOCK_LG_SCAN_RESULT, + TOTAL_FILE_SIZE_FOR_H81109, + TOTAL_FILE_SIZE_FOR_Y12345, + build_mock_results, +) +from tests.unit.helpers.data.statistic.mock_logs_query_results import ( + HASHED_USER_ID_1, + HASHED_USER_ID_2, + MOCK_LG_DELETED, + MOCK_LG_DOWNLOADED, + MOCK_LG_STORED, + MOCK_LG_VIEWED, + MOCK_UNIQUE_ACTIVE_USER_IDS, +) +from utils.cloudwatch_logs_query import ( + CloudwatchLogsQueryParams, + LloydGeorgeRecordsDeleted, + LloydGeorgeRecordsDownloaded, + LloydGeorgeRecordsStored, + LloydGeorgeRecordsViewed, + UniqueActiveUserIds, +) +from utils.common_query_filters import UploadCompleted + + +@pytest.fixture +def mock_dynamo_service(mocker): + def mock_implementation(table_name, **_kwargs): + if table_name == MOCK_LG_TABLE_NAME: + return MOCK_LG_SCAN_RESULT + elif table_name == MOCK_ARF_TABLE_NAME: + return MOCK_ARF_SCAN_RESULT + + patched_instance = mocker.patch( + "services.data_collection_service.DynamoDBService", spec=DynamoDBService + ).return_value + patched_method = patched_instance.scan_whole_table + patched_method.side_effect = mock_implementation + + yield patched_instance + + +@pytest.fixture +def mock_s3_list_all_objects(mocker): + def mock_implementation(bucket_name, **_kwargs): + if bucket_name == MOCK_LG_BUCKET: + return MOCK_LG_LIST_OBJECTS_RESULT + elif bucket_name == MOCK_ARF_BUCKET: + return MOCK_ARF_LIST_OBJECTS_RESULT + + patched_instance = mocker.patch( + "services.data_collection_service.S3Service", spec=S3Service + ).return_value + patched_method = patched_instance.list_all_objects + patched_method.side_effect = mock_implementation + + yield patched_method + + +@pytest.fixture +def mock_query_logs(mocker): + def mock_implementation(query_params: CloudwatchLogsQueryParams, **_kwargs): + if query_params == LloydGeorgeRecordsViewed: + return MOCK_LG_VIEWED + elif query_params == LloydGeorgeRecordsDownloaded: + return MOCK_LG_DOWNLOADED + elif query_params == LloydGeorgeRecordsDeleted: + return MOCK_LG_DELETED + elif query_params == LloydGeorgeRecordsStored: + return MOCK_LG_STORED + elif query_params == UniqueActiveUserIds: + return MOCK_UNIQUE_ACTIVE_USER_IDS + + patched_instance = mocker.patch( + "services.data_collection_service.CloudwatchService", + spec=CloudwatchService, + ).return_value + mocked_method = patched_instance.query_logs + mocked_method.side_effect = mock_implementation + + yield mocked_method + + +@pytest.fixture +def mock_service( + set_env, mock_query_logs, mock_dynamo_service, mock_s3_list_all_objects +): + service = DataCollectionService() + yield service + + +@pytest.fixture +def mock_uuid(mocker): + yield mocker.patch("uuid.uuid4", return_value="mock_uuid") + + +@pytest.fixture +def larger_mock_data(): + dynamodb_1, s3_1 = build_mock_results("H81109", "9000000001", 135, 123) + dynamodb_2, s3_2 = build_mock_results("H81109", "9000000002", 246, 456) + dynamodb_3, s3_3 = build_mock_results("H81109", "9000000003", 369, 789) + dynamodb_4, s3_4 = build_mock_results("Y12345", "9000000004", 4812, 9876) + dynamodb_5, s3_5 = build_mock_results("Y12345", "9000000005", 5101, 5432) + + mock_dynamo_scan_result = ( + dynamodb_1 + dynamodb_2 + dynamodb_3 + dynamodb_4 + dynamodb_5 + ) + mock_s3_list_objects_result = s3_1 + s3_2 + s3_3 + s3_4 + s3_5 + shuffle(mock_dynamo_scan_result) + shuffle(mock_s3_list_objects_result) + + return mock_dynamo_scan_result, mock_s3_list_objects_result + + +@freeze_time("2024-06-04T18:00:00Z") +def test_datetime_correctly_configured_during_initialise(set_env): + service = DataCollectionService() + + assert service.today_date == "20240604" + assert ( + service.collection_start_time + == datetime.fromisoformat("2024-06-03T18:00:00Z").timestamp() + ) + assert ( + service.collection_end_time + == datetime.fromisoformat("2024-06-04T18:00:00Z").timestamp() + ) + + +def test_collect_all_data_and_write_to_dynamodb(mock_service, mocker): + mock_collected_data = ["testing1234"] + mock_service.collect_all_data = mocker.MagicMock(return_value=mock_collected_data) + mock_service.write_to_dynamodb_table = mocker.MagicMock() + + mock_service.collect_all_data_and_write_to_dynamodb() + + mock_service.collect_all_data.assert_called_once() + mock_service.write_to_dynamodb_table.assert_called_with(mock_collected_data) + + +def test_collect_all_data(mock_service, mock_uuid): + actual = mock_service.collect_all_data() + expected = unordered( + MOCK_RECORD_STORE_DATA + MOCK_ORGANISATION_DATA + MOCK_APPLICATION_DATA + ) + + assert actual == expected + + +def test_write_to_dynamodb_table(mock_dynamo_service, mock_service): + mock_data = MOCK_RECORD_STORE_DATA + MOCK_ORGANISATION_DATA + MOCK_APPLICATION_DATA + mock_service.write_to_dynamodb_table(mock_data) + + mock_dynamo_service.batch_writing.assert_called_with( + table_name=MOCK_STATISTICS_TABLE, item_list=ALL_MOCK_DATA_AS_JSON_LIST + ) + + +def test_scan_dynamodb_tables(mock_dynamo_service, mock_service): + mock_service.scan_dynamodb_tables() + + expected_project_expression = "CurrentGpOds,NhsNumber,FileLocation,ContentType" + expected_filter_expression = UploadCompleted + + expected_calls = [ + call( + table_name=MOCK_ARF_TABLE_NAME, + project_expression=expected_project_expression, + filter_expression=expected_filter_expression, + ), + call( + table_name=MOCK_LG_TABLE_NAME, + project_expression=expected_project_expression, + filter_expression=expected_filter_expression, + ), + ] + mock_dynamo_service.scan_whole_table.assert_has_calls(expected_calls) + + +def test_get_all_s3_files_info(mock_s3_list_all_objects, mock_service): + mock_service.get_all_s3_files_info() + + expected_calls = [ + call(MOCK_ARF_BUCKET), + call(MOCK_LG_BUCKET), + ] + + mock_s3_list_all_objects.assert_has_calls(expected_calls) + + +def test_get_record_store_data(mock_service, mock_uuid): + mock_dynamo_scan_result = MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT + s3_list_objects_result = MOCK_ARF_LIST_OBJECTS_RESULT + MOCK_LG_LIST_OBJECTS_RESULT + + actual = mock_service.get_record_store_data( + mock_dynamo_scan_result, s3_list_objects_result + ) + expected = unordered(MOCK_RECORD_STORE_DATA) + + assert actual == expected + + +def test_get_organisation_data(mock_service, mock_uuid): + mock_dynamo_scan_result = MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT + + actual = mock_service.get_organisation_data(mock_dynamo_scan_result) + expected = unordered(MOCK_ORGANISATION_DATA) + + assert actual == expected + + +def test_get_application_data(mock_service, mock_uuid): + actual = mock_service.get_application_data() + expected = unordered(MOCK_APPLICATION_DATA) + + assert actual == expected + + +def test_get_active_user_list(set_env, mock_query_logs): + mock_query_logs.return_value = MOCK_UNIQUE_ACTIVE_USER_IDS + service = DataCollectionService() + expected = { + "H81109": [ + HASHED_USER_ID_1, + HASHED_USER_ID_2, + ], + "Y12345": [HASHED_USER_ID_1], + } + actual = service.get_active_user_list() + + assert actual == expected + + +@freeze_time("2024-06-04T10:25:00") +def test_get_cloud_watch_query_result(set_env, mock_query_logs): + mock_query_param = CloudwatchLogsQueryParams("mock", "test") + service = DataCollectionService() + expected_start_time = datetime.fromisoformat("2024-06-03T10:25:00").timestamp() + expected_end_time = datetime.fromisoformat("2024-06-04T10:25:00").timestamp() + + service.get_cloud_watch_query_result(mock_query_param) + + mock_query_logs.assert_called_with( + query_params=mock_query_param, + start_time=expected_start_time, + end_time=expected_end_time, + ) + + +def test_get_total_number_of_records(mock_service): + actual = mock_service.get_total_number_of_records( + MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT + ) + expected = [ + {"ods_code": "Y12345", "total_number_of_records": 2}, + {"ods_code": "H81109", "total_number_of_records": 6}, + ] + + assert actual == expected + + +def test_get_total_number_of_records_larger_mock_data(mock_service, larger_mock_data): + mock_dynamo_scan_result, _ = larger_mock_data + + actual = mock_service.get_total_number_of_records(mock_dynamo_scan_result) + expected = unordered( + [ + {"ods_code": "H81109", "total_number_of_records": 135 + 246 + 369}, + {"ods_code": "Y12345", "total_number_of_records": 4812 + 5101}, + ] + ) + + assert actual == expected + + +def test_get_number_of_patients(mock_service): + actual = mock_service.get_number_of_patients( + MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT + ) + expected = unordered( + [ + {"ods_code": "Y12345", "number_of_patients": 1}, + {"ods_code": "H81109", "number_of_patients": 2}, + ] + ) + + assert actual == expected + + +def test_get_metrics_for_total_and_average_file_sizes(mock_service): + mock_dynamo_scan_result = MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT + mock_s3_list_objects_result = ( + MOCK_ARF_LIST_OBJECTS_RESULT + MOCK_LG_LIST_OBJECTS_RESULT + ) + + actual = mock_service.get_metrics_for_total_and_average_file_sizes( + mock_dynamo_scan_result, mock_s3_list_objects_result + ) + + expected = unordered( + [ + { + "ods_code": "H81109", + "average_size_of_documents_per_patient_in_megabytes": TOTAL_FILE_SIZE_FOR_H81109 + / 2, + "total_size_of_records_in_megabytes": TOTAL_FILE_SIZE_FOR_H81109, + }, + { + "ods_code": "Y12345", + "average_size_of_documents_per_patient_in_megabytes": TOTAL_FILE_SIZE_FOR_Y12345, + "total_size_of_records_in_megabytes": TOTAL_FILE_SIZE_FOR_Y12345, + }, + ] + ) + + assert actual == expected + + +def test_get_metrics_for_total_and_average_file_sizes_larger_mock_data( + mock_service, larger_mock_data +): + mock_dynamo_scan_result, mock_s3_list_objects_result = larger_mock_data + actual = mock_service.get_metrics_for_total_and_average_file_sizes( + mock_dynamo_scan_result, mock_s3_list_objects_result + ) + + expected = unordered( + [ + { + "ods_code": "H81109", + "average_size_of_documents_per_patient_in_megabytes": (123 + 456 + 789) + / 3, + "total_size_of_records_in_megabytes": (123 + 456 + 789), + }, + { + "ods_code": "Y12345", + "average_size_of_documents_per_patient_in_megabytes": (9876 + 5432) / 2, + "total_size_of_records_in_megabytes": (9876 + 5432), + }, + ] + ) + + assert actual == expected + + +def test_get_number_of_document_types(mock_service): + actual = mock_service.get_number_of_document_types(MOCK_ARF_SCAN_RESULT) + expected = unordered( + [ + {"ods_code": "Y12345", "number_of_document_types": 2}, + {"ods_code": "H81109", "number_of_document_types": 1}, + ] + ) + + assert actual == expected + + actual = mock_service.get_number_of_document_types( + MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT + ) + expected = unordered( + [ + {"ods_code": "Y12345", "number_of_document_types": 2}, + {"ods_code": "H81109", "number_of_document_types": 2}, + ] + ) + + assert actual == expected + + +def test_get_average_number_of_file_per_patient(mock_service): + actual = mock_service.get_average_number_of_files_per_patient( + MOCK_ARF_SCAN_RESULT + MOCK_LG_SCAN_RESULT + ) + expected = unordered( + [ + {"ods_code": "Y12345", "average_records_per_patient": 2}, + {"ods_code": "H81109", "average_records_per_patient": 3}, + ] + ) + + assert actual == expected + + +def test_get_average_number_of_file_per_patient_larger_mock_data( + mock_service, larger_mock_data +): + mock_dynamo_scan_result, _ = larger_mock_data + + actual = mock_service.get_average_number_of_files_per_patient( + mock_dynamo_scan_result + ) + expected = unordered( + [ + { + "ods_code": "H81109", + "average_records_per_patient": Decimal(135 + 246 + 369) / 3, + }, + { + "ods_code": "Y12345", + "average_records_per_patient": Decimal(4812 + 5101) / 2, + }, + ] + ) + + assert actual == expected diff --git a/lambdas/tests/unit/services/test_login_service.py b/lambdas/tests/unit/services/test_login_service.py index c034b552c..9501eaefe 100644 --- a/lambdas/tests/unit/services/test_login_service.py +++ b/lambdas/tests/unit/services/test_login_service.py @@ -115,7 +115,7 @@ def test_exchange_token_respond_with_auth_token_and_repo_role( dynamo_state_query_result = {"Count": 1, "Items": [{"id": "state"}]} mocker.patch.object( - DynamoDBService, "simple_query", return_value=dynamo_state_query_result + DynamoDBService, "query_all_fields", return_value=dynamo_state_query_result ) mocker.patch.object(DynamoDBService, "delete_item") @@ -163,7 +163,7 @@ def test_exchange_token_raises_login_error_when_given_state_is_not_in_state_tabl mock_aws_infras, mock_oidc_service, mocker ): mocker.patch.object( - DynamoDBService, "simple_query", return_value={"Count": 0, "Items": []} + DynamoDBService, "query_all_fields", return_value={"Count": 0, "Items": []} ) login_service = LoginService() @@ -190,7 +190,7 @@ def keys(self): dynamo_state_query_result = {"Count": 1, "Items": [{"id": "state"}]} mocker.patch.object( - DynamoDBService, "simple_query", return_value=dynamo_state_query_result + DynamoDBService, "query_all_fields", return_value=dynamo_state_query_result ) mocker.patch.object(DynamoDBService, "delete_item") @@ -213,7 +213,7 @@ def test_exchange_token_raises_error_when_encounter_boto3_error( ): mocker.patch.object( DynamoDBService, - "simple_query", + "query_all_fields", side_effect=ClientError( {"Error": {"Code": "500", "Message": "mocked error"}}, "test" ), diff --git a/lambdas/tests/unit/services/test_statistical_report_service.py b/lambdas/tests/unit/services/test_statistical_report_service.py new file mode 100644 index 000000000..ff64f1430 --- /dev/null +++ b/lambdas/tests/unit/services/test_statistical_report_service.py @@ -0,0 +1,350 @@ +import tempfile +from random import shuffle +from unittest.mock import call + +import polars as pl +import pytest +from boto3.dynamodb.conditions import Key +from freezegun import freeze_time +from models.statistics import ApplicationData +from polars.testing import assert_frame_equal +from services.base.dynamo_service import DynamoDBService +from services.base.s3_service import S3Service +from services.statistical_report_service import StatisticalReportService +from tests.unit.conftest import MOCK_STATISTICS_REPORT_BUCKET, MOCK_STATISTICS_TABLE +from tests.unit.helpers.data.statistic.mock_data_build_utils import ( + build_random_application_data, + build_random_organisation_data, + build_random_record_store_data, +) +from tests.unit.helpers.data.statistic.mock_statistic_data import ( + ALL_MOCKED_STATISTIC_DATA, + EXPECTED_SUMMARY_APPLICATION_DATA, + EXPECTED_SUMMARY_ORGANISATION_DATA, + EXPECTED_SUMMARY_RECORD_STORE_DATA, + EXPECTED_WEEKLY_SUMMARY, + MOCK_APPLICATION_DATA_1, + MOCK_APPLICATION_DATA_2, + MOCK_APPLICATION_DATA_3, + MOCK_DYNAMODB_QUERY_RESPONSE, + MOCK_ORGANISATION_DATA_1, + MOCK_ORGANISATION_DATA_2, + MOCK_ORGANISATION_DATA_3, + MOCK_RECORD_STORE_DATA_1, + MOCK_RECORD_STORE_DATA_2, + MOCK_RECORD_STORE_DATA_3, +) +from utils.exceptions import StatisticDataNotFoundException + + +@pytest.fixture +def mock_service(set_env, mock_s3_service, mock_dynamodb_service): + return StatisticalReportService() + + +@pytest.fixture +def mock_s3_service(mocker): + patched_instance = mocker.patch( + "services.statistical_report_service.S3Service", spec=S3Service + ).return_value + + yield patched_instance + + +@pytest.fixture +def mock_dynamodb_service(mocker): + patched_instance = mocker.patch( + "services.statistical_report_service.DynamoDBService", spec=DynamoDBService + ).return_value + + yield patched_instance + + +@pytest.fixture +def mock_temp_folder(mocker): + mocker.patch.object(pl.DataFrame, "write_csv") + mocker.patch("shutil.rmtree") + temp_folder = tempfile.mkdtemp() + mocker.patch.object(tempfile, "mkdtemp", return_value=temp_folder) + yield temp_folder + + +@freeze_time("2024-06-06T18:00:00Z") +def test_datetime_correctly_configured_during_initialise(set_env): + service = StatisticalReportService() + + assert service.dates_to_collect == [ + "20240530", + "20240531", + "20240601", + "20240602", + "20240603", + "20240604", + "20240605", + ] + assert service.report_period == "20240530-20240605" + + +@freeze_time("20240512T07:00:00Z") +def test_make_weekly_summary(set_env, mocker): + data = ALL_MOCKED_STATISTIC_DATA + service = StatisticalReportService() + service.get_statistic_data = mocker.MagicMock(return_value=data) + + actual = service.make_weekly_summary() + expected = EXPECTED_WEEKLY_SUMMARY + + assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False) + + +def test_get_statistic_data(mock_dynamodb_service, mock_service): + mock_service.dates_to_collect = ["20240510", "20240511"] + mock_dynamodb_service.query_all_fields.side_effect = MOCK_DYNAMODB_QUERY_RESPONSE + + actual = mock_service.get_statistic_data() + expected = ALL_MOCKED_STATISTIC_DATA + + assert actual == expected + + expected_calls = [ + call( + table_name=MOCK_STATISTICS_TABLE, + key_condition_expression=Key("Date").eq("20240510"), + ), + call( + table_name=MOCK_STATISTICS_TABLE, + key_condition_expression=Key("Date").eq("20240511"), + ), + ] + + mock_dynamodb_service.query_all_fields.assert_has_calls(expected_calls) + + +def test_get_statistic_data_raise_error_if_all_data_are_empty( + mock_dynamodb_service, mock_service +): + mock_dynamodb_service.query_all_fields.return_value = {"Items": []} + + with pytest.raises(StatisticDataNotFoundException): + mock_service.get_statistic_data() + + +def test_summarise_record_store_data(mock_service): + actual = mock_service.summarise_record_store_data( + [MOCK_RECORD_STORE_DATA_1, MOCK_RECORD_STORE_DATA_2, MOCK_RECORD_STORE_DATA_3] + ) + + expected = EXPECTED_SUMMARY_RECORD_STORE_DATA + + assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False) + + +def test_summarise_record_store_data_larger_mock_data(mock_service): + mock_data_h81109 = build_random_record_store_data( + "H81109", ["20240601", "20240603", "20240604", "20240605", "20240607"] + ) + mock_data_y12345 = build_random_record_store_data( + "Y12345", ["20240601", "20240602", "20240603", "20240606"] + ) + mock_record_store_data = mock_data_h81109 + mock_data_y12345 + shuffle(mock_record_store_data) + + latest_record_in_h81109 = max(mock_data_h81109, key=lambda record: record.date) + latest_record_in_y12345 = max(mock_data_y12345, key=lambda record: record.date) + expected = pl.DataFrame([latest_record_in_h81109, latest_record_in_y12345]).drop( + "date", "statistic_id" + ) + + actual = mock_service.summarise_record_store_data(mock_record_store_data) + + assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False) + + +def test_summarise_record_store_data_can_handle_empty_input(mock_service): + empty_input = [] + actual = mock_service.summarise_record_store_data(empty_input) + + assert isinstance(actual, pl.DataFrame) + assert actual.is_empty() + + +def test_summarise_organisation_data(mock_service): + actual = mock_service.summarise_organisation_data( + [MOCK_ORGANISATION_DATA_1, MOCK_ORGANISATION_DATA_2, MOCK_ORGANISATION_DATA_3] + ) + + expected = EXPECTED_SUMMARY_ORGANISATION_DATA + + assert_frame_equal(actual, expected, check_row_order=False, check_dtype=False) + + +def test_summarise_organisation_data_larger_mock_data(mock_service): + mock_data_h81109 = build_random_organisation_data( + "H81109", ["20240603", "20240604", "20240605", "20240606", "20240607"] + ) + mock_data_y12345 = build_random_organisation_data( + "Y12345", ["20240603", "20240604", "20240605", "20240606", "20240607"] + ) + mock_input_data = {"H81109": mock_data_h81109, "Y12345": mock_data_y12345} + + mock_organisation_data = mock_data_h81109 + mock_data_y12345 + shuffle(mock_organisation_data) + + actual = mock_service.summarise_organisation_data(mock_organisation_data) + + for ods_code in mock_input_data.keys(): + mock_data_of_ods_code = mock_input_data[ods_code] + row_in_actual_data = actual.filter(pl.col("ods_code") == ods_code) + assert_weekly_counts_match_sum_of_daily_counts( + mock_data_of_ods_code, row_in_actual_data + ) + assert_average_record_per_patient_correct( + mock_data_of_ods_code, row_in_actual_data + ) + assert_number_of_patient_correct(mock_data_of_ods_code, row_in_actual_data) + + +def assert_weekly_counts_match_sum_of_daily_counts(mock_data, row_in_actual_data): + for count_type in ["viewed", "downloaded", "stored", "deleted"]: + expected_weekly_count = sum( + getattr(data, f"daily_count_{count_type}") for data in mock_data + ) + actual_weekly_count = row_in_actual_data.item(0, f"weekly_count_{count_type}") + + assert actual_weekly_count == expected_weekly_count + + +def assert_average_record_per_patient_correct(mock_data, row_in_actual_data): + expected_average_patient_record = sum( + data.average_records_per_patient for data in mock_data + ) / len(mock_data) + actual_average_patient_record = row_in_actual_data.item( + 0, "average_records_per_patient" + ) + + assert actual_average_patient_record == float(expected_average_patient_record) + + +def assert_number_of_patient_correct(mock_data, row_in_actual_data): + most_recent_record_in_mock_data = max(mock_data, key=lambda data: data.date) + expected_number_of_patients = most_recent_record_in_mock_data.number_of_patients + actual_number_of_patient = row_in_actual_data.item(0, "number_of_patients") + + assert actual_number_of_patient == expected_number_of_patients + + +def test_summarise_organisation_data_can_handle_empty_input(mock_service): + empty_input = [] + actual = mock_service.summarise_organisation_data(empty_input) + + assert isinstance(actual, pl.DataFrame) + assert actual.is_empty() + + +def test_summarise_application_data(mock_service): + mock_data = [ + MOCK_APPLICATION_DATA_1, + MOCK_APPLICATION_DATA_2, + MOCK_APPLICATION_DATA_3, + ] + + expected = EXPECTED_SUMMARY_APPLICATION_DATA + actual = mock_service.summarise_application_data(mock_data) + + assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False) + + +def test_summarise_application_data_larger_mock_data(mock_service): + mock_data_h81109 = build_random_application_data( + "H81109", ["20240603", "20240604", "20240605", "20240606", "20240607"] + ) + mock_data_y12345 = build_random_application_data( + "Y12345", ["20240603", "20240604", "20240605", "20240606", "20240607"] + ) + mock_organisation_data = mock_data_h81109 + mock_data_y12345 + shuffle(mock_organisation_data) + + active_users_count_h81109 = count_unique_user_ids(mock_data_h81109) + active_users_count_y12345 = count_unique_user_ids(mock_data_y12345) + + expected = pl.DataFrame( + [ + {"ods_code": "H81109", "active_users_count": active_users_count_h81109}, + {"ods_code": "Y12345", "active_users_count": active_users_count_y12345}, + ] + ) + actual = mock_service.summarise_application_data(mock_organisation_data) + + assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False) + + +def count_unique_user_ids(mock_data: list[ApplicationData]) -> int: + active_users_of_each_day = [set(data.active_user_ids_hashed) for data in mock_data] + unique_active_users_for_whole_week = set.union(*active_users_of_each_day) + return len(unique_active_users_for_whole_week) + + +def test_summarise_application_data_can_handle_empty_input(mock_service): + empty_input = [] + actual = mock_service.summarise_application_data(empty_input) + + assert isinstance(actual, pl.DataFrame) + assert actual.is_empty() + + +def test_join_dataframes_by_ods_code(mock_service): + mock_data_1 = pl.DataFrame([{"ods_code": "Y12345", "field1": "apple"}]) + mock_data_2 = pl.DataFrame( + [ + {"ods_code": "Y12345", "field2": "banana"}, + {"ods_code": "Z56789", "field2": "cherry"}, + ] + ) + + expected = pl.DataFrame( + [ + {"ods_code": "Y12345", "field1": "apple", "field2": "banana"}, + {"ods_code": "Z56789", "field2": "cherry"}, + ] + ) + actual = mock_service.join_dataframes_by_ods_code([mock_data_1, mock_data_2]) + + assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False) + + +def test_join_dataframes_by_ods_code_can_handle_empty_dataframe(mock_service): + mock_data_1 = pl.DataFrame([{"ods_code": "Y12345", "field1": "cat"}]) + mock_data_2 = pl.DataFrame() + mock_data_3 = pl.DataFrame( + [ + {"ods_code": "Y12345", "field2": "dog"}, + {"ods_code": "Z56789", "field3": "lizard"}, + ] + ) + + expected = pl.DataFrame( + [ + {"ods_code": "Y12345", "field1": "cat", "field2": "dog"}, + {"ods_code": "Z56789", "field3": "lizard"}, + ] + ) + actual = mock_service.join_dataframes_by_ods_code( + [mock_data_1, mock_data_2, mock_data_3] + ) + + assert_frame_equal(actual, expected, check_dtype=False, check_row_order=False) + + +@freeze_time("20240512T07:00:00Z") +def test_store_report_to_s3(set_env, mock_s3_service, mock_temp_folder): + mock_weekly_summary = EXPECTED_WEEKLY_SUMMARY + expected_filename = "statistical_report_20240505-20240511.csv" + expected_local_file_path = f"{mock_temp_folder}/{expected_filename}" + + service = StatisticalReportService() + + service.store_report_to_s3(mock_weekly_summary) + + mock_s3_service.upload_file.assert_called_with( + expected_local_file_path, MOCK_STATISTICS_REPORT_BUCKET, expected_filename + ) diff --git a/lambdas/tests/unit/utils/test_utilities.py b/lambdas/tests/unit/utils/test_utilities.py index 8dd31f6fe..e81617629 100755 --- a/lambdas/tests/unit/utils/test_utilities.py +++ b/lambdas/tests/unit/utils/test_utilities.py @@ -2,6 +2,7 @@ from utils.exceptions import InvalidResourceIdException from utils.utilities import ( camelize_dict, + flatten, get_file_key_from_s3_url, redact_id_to_last_4_chars, validate_nhs_number, @@ -48,3 +49,12 @@ def test_get_file_key_from_s3_url(): actual = get_file_key_from_s3_url(test_url) assert actual == expected + + +def test_flatten_reduce_one_level_of_nesting_given_a_nested_list(): + nested_list = [["a", "b", "c"], ["d", "e"], ["f"], ["a"]] + expected = ["a", "b", "c", "d", "e", "f", "a"] + + actual = flatten(nested_list) + + assert actual == expected diff --git a/lambdas/utils/cloudwatch_logs_query.py b/lambdas/utils/cloudwatch_logs_query.py new file mode 100644 index 000000000..21d89130a --- /dev/null +++ b/lambdas/utils/cloudwatch_logs_query.py @@ -0,0 +1,53 @@ +from dataclasses import dataclass + + +@dataclass +class CloudwatchLogsQueryParams: + lambda_name: str + query_string: str + + +LloydGeorgeRecordsViewed = CloudwatchLogsQueryParams( + lambda_name="LloydGeorgeStitchLambda", + query_string=""" + fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code + | filter Message = 'User has viewed Lloyd George records' + | stats count() AS daily_count_viewed BY ods_code + """, +) + +LloydGeorgeRecordsDownloaded = CloudwatchLogsQueryParams( + lambda_name="DocumentManifestByNHSNumberLambda", + query_string=""" + fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code + | filter Message = 'User has downloaded Lloyd George records' + | stats count() AS daily_count_downloaded BY ods_code + """, +) + +LloydGeorgeRecordsDeleted = CloudwatchLogsQueryParams( + lambda_name="DeleteDocRefLambda", + query_string=""" + fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code + | filter Message = "Deleted document of type LG" + | stats count() AS daily_count_deleted BY ods_code + """, +) + +LloydGeorgeRecordsStored = CloudwatchLogsQueryParams( + lambda_name="UploadConfirmResultLambda", + query_string=""" + fields @timestamp, Message, Authorisation.selected_organisation.org_ods_code AS ods_code + | filter Message = 'Finished processing all documents' + | stats count() AS daily_count_stored BY ods_code + """, +) + +UniqueActiveUserIds = CloudwatchLogsQueryParams( + lambda_name="AuthoriserLambda", + query_string=""" + fields @timestamp, Authorisation.selected_organisation.org_ods_code AS ods_code, Authorisation.nhs_user_id AS user_id + | filter ispresent(ods_code) AND ispresent(user_id) + | dedup(ods_code, user_id) + """, +) diff --git a/lambdas/utils/decorators/ensure_env_var.py b/lambdas/utils/decorators/ensure_env_var.py index 6a3cb091a..1e0da023b 100644 --- a/lambdas/utils/decorators/ensure_env_var.py +++ b/lambdas/utils/decorators/ensure_env_var.py @@ -3,6 +3,7 @@ from enums.lambda_error import LambdaError from utils.audit_logging_setup import LoggingService +from utils.exceptions import MissingEnvVarException from utils.lambda_response import ApiGatewayResponse logger = LoggingService(__name__) @@ -37,3 +38,34 @@ def interceptor(event, context): return interceptor return wrapper + + +def ensure_environment_variables_for_non_webapi(names: list[str]) -> Callable: + """A decorator for lambda handler. + Verify that the lambda environment got a set of specific environment variables. + If not, log and throw an error. + Use for lambdas that are NOT supposed to be integrated with API Gateway. + + Usage: + @ensure_environment_variables_for_non_webapi(names=["LLOYD_GEORGE_BUCKET_NAME", "LLOYD_GEORGE_DYNAMODB_NAME"]) + def lambda_handler(event, context): + ... + """ + + def wrapper(lambda_func: Callable): + def interceptor(event, context): + missing_env_vars = set(names) - set(os.environ) + if missing_env_vars: + missing_env_vars_in_string = ", ".join(sorted(missing_env_vars)) + error_body = LambdaError.EnvMissing.create_error_body( + {"name": missing_env_vars_in_string} + ) + logger.error(error_body, {"Result": "Failed to run lambda"}) + raise MissingEnvVarException(error_body) + + # Validation done. Return control flow to original lambda handler + return lambda_func(event, context) + + return interceptor + + return wrapper diff --git a/lambdas/utils/exceptions.py b/lambdas/utils/exceptions.py index 5c230edf9..098ae2891 100644 --- a/lambdas/utils/exceptions.py +++ b/lambdas/utils/exceptions.py @@ -114,3 +114,11 @@ class FhirResourceNotFound(Exception): class FileUploadInProgress(Exception): pass + + +class LogsQueryException(Exception): + pass + + +class StatisticDataNotFoundException(Exception): + pass diff --git a/lambdas/utils/utilities.py b/lambdas/utils/utilities.py index cacd9d4d4..7c3493d59 100755 --- a/lambdas/utils/utilities.py +++ b/lambdas/utils/utilities.py @@ -1,3 +1,4 @@ +import itertools import os import re import uuid @@ -54,3 +55,7 @@ def redact_id_to_last_4_chars(str_id: str) -> str: def get_file_key_from_s3_url(s3_url: str) -> str: return urlparse(s3_url).path.lstrip("/") + + +def flatten(nested_list: list[list]) -> list: + return list(itertools.chain(*nested_list)) diff --git a/sonar-project.properties b/sonar-project.properties index 274a61e0a..27633ba18 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -14,7 +14,7 @@ sonar.python.coverage.reportPaths=lambdas/coverage.xml sonar.sources=lambdas/,app/src/ sonar.tests=lambdas/tests/,app/src/ -sonar.exclusions=**/*.test.tsx,app/src/helpers/test/,**/*.story.tsx,**/TestPanel.tsx,lambdas/scripts/,**/*.test.ts,**/*.story.ts,lambdas/tests/ +sonar.exclusions=**/*.test.tsx,app/src/helpers/test/,**/*.story.tsx,**/TestPanel.tsx,lambdas/scripts/,**/*.test.ts,**/*.story.ts,lambdas/tests/* sonar.test.inclusions=**/*.test.tsx,app/src/helpers/test/,**/*.test.ts # Encoding of the source code. Default is default system encoding