Merge branch 'main' into PRMP-1188

steph-torres-nhs authored Dec 2, 2024
2 parents 7943515 + b9a3138 commit 8422e69
Showing 5 changed files with 158 additions and 67 deletions.
3 changes: 2 additions & 1 deletion lambdas/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-alpine
+FROM python:3.11

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
@@ -8,6 +8,7 @@ WORKDIR /lambdas
COPY requirements /lambdas/requirements

RUN pip install -r requirements/layers/requirements_core_lambda_layer.txt
+RUN pip install -r requirements/layers/requirements_data_lambda_layer.txt

COPY . /lambdas

8 changes: 8 additions & 0 deletions lambdas/scripts/batch_update_ods_code.py
@@ -10,6 +10,8 @@
from pydantic import BaseModel, TypeAdapter, ValidationError
from requests import HTTPError
from services.base.dynamo_service import DynamoDBService
+from services.data_collection_service import DataCollectionService
+from services.statistical_report_service import StatisticalReportService
from utils.exceptions import PdsErrorException, PdsResponseValidationException
from utils.utilities import get_pds_service

@@ -233,3 +235,9 @@ def setup_logging_for_local_script():
if __name__ == "__main__":
    setup_logging_for_local_script()
    BatchUpdate().main()
+    print("Starting data collection process")
+    data_collection_service = DataCollectionService()
+    data_collection_service.collect_all_data_and_write_to_dynamodb()
+    print("Starting to create statistical report")
+    statistical_report_service = StatisticalReportService()
+    statistical_report_service.make_weekly_summary_and_output_to_bucket()
73 changes: 47 additions & 26 deletions lambdas/services/data_collection_service.py
@@ -1,7 +1,7 @@
import hashlib
import os
from collections import Counter, defaultdict
-from datetime import datetime
+from datetime import datetime, timedelta

import polars as pl
from enums.metadata_field_names import DocumentReferenceMetadataFields
@@ -46,19 +46,17 @@ def __init__(self):
        self.dynamodb_service = DynamoDBService()
        self.s3_service = S3Service()

-        one_day = 60 * 60 * 24
-        time_now = int(datetime.now().timestamp())
+        self.end_date = datetime.combine(datetime.today(), datetime.min.time())
+        self.start_date = self.end_date - timedelta(days=7)

-        self.collection_start_time = time_now - one_day
-        self.collection_end_time = time_now
+        self.weekly_collection_start_date = int(self.start_date.timestamp())
+        self.weekly_collection_end_date = int(self.end_date.timestamp())
        self.today_date = datetime.today().strftime("%Y%m%d")

    def collect_all_data_and_write_to_dynamodb(self):
-        time_period_human_readable = (
-            f"{datetime.fromtimestamp(self.collection_start_time)}"
-            f" ~ {datetime.fromtimestamp(self.collection_end_time)}"
+        logger.info(
+            f"Collecting data for the past week: {self.start_date} to {self.end_date}."
        )
-        logger.info(f"Collecting data between {time_period_human_readable}.")

        all_statistic_data = self.collect_all_data()
        logger.info("Finished collecting data. Will output to dynamodb table.")
@@ -75,12 +73,24 @@ def collect_all_data(self) -> list[StatisticData]:
        record_store_data = self.get_record_store_data(
            dynamodb_scan_result, s3_list_objects_result
        )
-        organisation_data = self.get_organisation_data(dynamodb_scan_result)

-        application_data = self.get_application_data()
+        organisation_data = []
+        application_data = []
+
+        for day_start in self.generate_daily_ranges():
+            day_end = day_start + timedelta(days=1)
+            logger.info(f"Collecting data for day: {day_start} to {day_end}")
+
+            organisation_data += self.get_organisation_data(
+                dynamodb_scan_result, day_start, day_end
+            )
+            application_data += self.get_application_data(day_start, day_end)

        return record_store_data + organisation_data + application_data

+    def generate_daily_ranges(self) -> list[datetime]:
+        return [self.start_date + timedelta(days=i) for i in range(7)]
+
    def write_to_dynamodb_table(self, all_statistic_data: list[StatisticData]):
        logger.info("Writing statistic data to dynamodb table")
        item_list = []
@@ -162,23 +172,27 @@ def get_record_store_data(
        return record_store_data_for_all_ods_code

    def get_organisation_data(
-        self, dynamodb_scan_result: list[dict]
+        self, dynamodb_scan_result: list[dict], start_date: datetime, end_date: datetime
    ) -> list[OrganisationData]:

        number_of_patients = self.get_number_of_patients(dynamodb_scan_result)
        average_records_per_patient = self.get_average_number_of_files_per_patient(
            dynamodb_scan_result
        )
-        daily_count_viewed = self.get_cloud_watch_query_result(LloydGeorgeRecordsViewed)
+        daily_count_viewed = self.get_cloud_watch_query_result(
+            LloydGeorgeRecordsViewed, start_date, end_date
+        )
        daily_count_downloaded = self.get_cloud_watch_query_result(
-            LloydGeorgeRecordsDownloaded
+            LloydGeorgeRecordsDownloaded, start_date, end_date
        )
        daily_count_deleted = self.get_cloud_watch_query_result(
-            LloydGeorgeRecordsDeleted
+            LloydGeorgeRecordsDeleted, start_date, end_date
        )
-        daily_count_stored = self.get_cloud_watch_query_result(LloydGeorgeRecordsStored)
+        daily_count_stored = self.get_cloud_watch_query_result(
+            LloydGeorgeRecordsStored, start_date, end_date
+        )
        daily_count_searched = self.get_cloud_watch_query_result(
-            LloydGeorgeRecordsSearched
+            LloydGeorgeRecordsSearched, start_date, end_date
        )

        joined_query_result = self.join_results_by_ods_code(
@@ -195,29 +209,33 @@ def get_organisation_data(

        organisation_data_for_all_ods_code = [
            OrganisationData(
-                date=self.today_date,
+                date=start_date.strftime("%Y%m%d"),
                **organisation_data_properties,
            )
            for organisation_data_properties in joined_query_result
        ]

        return organisation_data_for_all_ods_code

-    def get_application_data(self) -> list[ApplicationData]:
-        user_id_per_ods_code = self.get_active_user_list()
+    def get_application_data(
+        self, start_date: datetime, end_date: datetime
+    ) -> list[ApplicationData]:
+        user_id_per_ods_code = self.get_active_user_list(start_date, end_date)
        application_data_for_all_ods_code = [
            ApplicationData(
-                date=self.today_date,
+                date=start_date.strftime("%Y%m%d"),
                active_user_ids_hashed=active_user_ids_hashed,
                ods_code=ods_code,
            )
            for ods_code, active_user_ids_hashed in user_id_per_ods_code.items()
        ]
        return application_data_for_all_ods_code

-    def get_active_user_list(self) -> dict[str, list]:
+    def get_active_user_list(
+        self, start_date: datetime, end_date: datetime
+    ) -> dict[str, list]:
        query_result = self.get_cloud_watch_query_result(
-            query_params=UniqueActiveUserIds
+            query_params=UniqueActiveUserIds, start_time=start_date, end_time=end_date
        )
        user_ids_per_ods_code = defaultdict(list)
        for entry in query_result:
@@ -232,12 +250,15 @@ def get_active_user_list(self) -> dict[str, list]:
        return user_ids_per_ods_code

    def get_cloud_watch_query_result(
-        self, query_params: CloudwatchLogsQueryParams
+        self,
+        query_params: CloudwatchLogsQueryParams,
+        start_time: datetime,
+        end_time: datetime,
    ) -> list[dict]:
        return self.cloudwatch_service.query_logs(
            query_params=query_params,
-            start_time=self.collection_start_time,
-            end_time=self.collection_end_time,
+            start_time=int(start_time.timestamp()),
+            end_time=int(end_time.timestamp()),
        )

    def get_total_number_of_records(
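Taken together, the changes to this service replace the old rolling 24-hour collection window with a midnight-aligned seven-day window that is queried one day at a time. Below is a minimal, standalone sketch of just that date arithmetic, lifted from the diff; the print call is only a stand-in for the real CloudWatch Logs query, and everything else mirrors lines shown above:

```python
from datetime import datetime, timedelta

# Midnight today ends the collection window; the window spans the seven
# days before it (mirrors the new DataCollectionService.__init__).
end_date = datetime.combine(datetime.today(), datetime.min.time())
start_date = end_date - timedelta(days=7)

# One start point per day of the past week (mirrors generate_daily_ranges).
daily_ranges = [start_date + timedelta(days=i) for i in range(7)]

for day_start in daily_ranges:
    day_end = day_start + timedelta(days=1)
    # CloudWatch Logs queries take epoch seconds, hence the int(...) casts.
    print(int(day_start.timestamp()), int(day_end.timestamp()))
```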
59 changes: 44 additions & 15 deletions lambdas/tests/unit/helpers/data/statistic/mock_collected_data.py
@@ -1,6 +1,12 @@
-from datetime import datetime
+from datetime import datetime, timedelta

-from models.statistics import ApplicationData, OrganisationData, RecordStoreData
+from models.statistics import (
+    ApplicationData,
+    OrganisationData,
+    RecordStoreData,
+    StatisticData,
+)
+from tests.unit.conftest import TEST_UUID
from tests.unit.helpers.data.statistic.mock_dynamodb_and_s3_records import (
    TOTAL_FILE_SIZE_FOR_H81109,
    TOTAL_FILE_SIZE_FOR_Y12345,
@@ -11,12 +17,15 @@
    HASHED_USER_ID_2_WITH_CLINICAL_ROLE,
)

-TODAY_DATE = datetime.today().strftime("%Y%m%d")
+START_DATE = datetime(2024, 5, 28, 10, 25, 0)
+END_DATE = datetime(2024, 6, 4, 10, 25, 0)
+START_DATE_STR = START_DATE.strftime("%Y%m%d")
+END_DATE_STR = END_DATE.strftime("%Y%m%d")

MOCK_RECORD_STORE_DATA = [
    RecordStoreData(
-        statistic_id="mock_uuid",
-        date=TODAY_DATE,
+        statistic_id=TEST_UUID,
+        date=START_DATE_STR,
        ods_code="H81109",
        total_number_of_records=6,
        number_of_document_types=2,
@@ -25,8 +34,8 @@
        / 2,
    ),
    RecordStoreData(
-        statistic_id="mock_uuid",
-        date=TODAY_DATE,
+        statistic_id=TEST_UUID,
+        date=START_DATE_STR,
        ods_code="Y12345",
        total_number_of_records=2,
        number_of_document_types=2,
@@ -37,8 +46,8 @@

MOCK_ORGANISATION_DATA = [
    OrganisationData(
-        statistic_id="mock_uuid",
-        date=TODAY_DATE,
+        statistic_id=TEST_UUID,
+        date=START_DATE_STR,
        ods_code="H81109",
        number_of_patients=2,
        average_records_per_patient=3,
@@ -49,8 +58,8 @@
        daily_count_searched=30,
    ),
    OrganisationData(
-        statistic_id="mock_uuid",
-        date=TODAY_DATE,
+        statistic_id=TEST_UUID,
+        date=START_DATE_STR,
        ods_code="Y12345",
        number_of_patients=1,
        average_records_per_patient=2,
@@ -64,17 +73,17 @@

MOCK_APPLICATION_DATA = [
    ApplicationData(
-        statistic_id="mock_uuid",
-        date=TODAY_DATE,
+        statistic_id=TEST_UUID,
+        date=START_DATE_STR,
        ods_code="H81109",
        active_user_ids_hashed=[
            HASHED_USER_ID_1_WITH_ADMIN_ROLE,
            HASHED_USER_ID_2_WITH_CLINICAL_ROLE,
        ],
    ),
    ApplicationData(
-        statistic_id="mock_uuid",
-        date=TODAY_DATE,
+        statistic_id=TEST_UUID,
+        date=START_DATE_STR,
        ods_code="Y12345",
        active_user_ids_hashed=[HASHED_USER_ID_1_WITH_PCSE_ROLE],
    ),
@@ -84,3 +93,23 @@
ALL_MOCK_DATA_AS_JSON_LIST = list(
    map(lambda data: data.model_dump(by_alias=True), ALL_MOCK_DATA)
)
+
+
+def get_weekly_data(statistics: list[StatisticData]):
+    weekly_data = []
+    for i in range(7):
+        for record in statistics:
+            new_record = record.model_copy()
+            new_date = datetime.strptime(record.date, "%Y%m%d")
+            new_record.date = (new_date + timedelta(days=i)).strftime("%Y%m%d")
+            weekly_data.append(new_record)
+    return weekly_data
+
+
+MOCK_WEEKLY_ORGANISATION_DATA = get_weekly_data(MOCK_ORGANISATION_DATA)
+MOCK_WEEKLY_APPLICATION_DATA = get_weekly_data(MOCK_APPLICATION_DATA)
+ALL_MOCK_WEEKLY_DATA = (
+    MOCK_RECORD_STORE_DATA
+    + MOCK_WEEKLY_APPLICATION_DATA
+    + MOCK_WEEKLY_ORGANISATION_DATA
+)
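For context on the new weekly fixtures, here is a standalone sketch of how get_weekly_data() fans each mock record out across seven consecutive days. FakeStatistic is a stand-in pydantic model invented for this example, since the real StatisticData model is not shown in this diff; the helper body itself is copied from the lines above:

```python
from datetime import datetime, timedelta

from pydantic import BaseModel


class FakeStatistic(BaseModel):
    date: str


def get_weekly_data(statistics: list[FakeStatistic]) -> list[FakeStatistic]:
    # Copy each record seven times, advancing the date by one day per pass.
    weekly_data = []
    for i in range(7):
        for record in statistics:
            new_record = record.model_copy()
            new_date = datetime.strptime(record.date, "%Y%m%d")
            new_record.date = (new_date + timedelta(days=i)).strftime("%Y%m%d")
            weekly_data.append(new_record)
    return weekly_data


weekly = get_weekly_data([FakeStatistic(date="20240528")])
print([record.date for record in weekly])
# ['20240528', '20240529', '20240530', '20240531', '20240601', '20240602', '20240603']
```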