Skip to content

Commit

Permalink
Merge pull request #28 from GSA-TTS/reports
Browse files Browse the repository at this point in the history
Data Submission Reports Feature
  • Loading branch information
akuny authored Feb 9, 2024
2 parents b57741b + 8f3867f commit bba1eb3
Show file tree
Hide file tree
Showing 21 changed files with 388 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 88
exclude = venv,.git,__pycache__,docs/source/conf.py,old,build,dist,node_modules
ignore = F403, F401
ignore = F403, F401, W503
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,5 +171,8 @@ control/
# Zipped artifacts
*.zip

# frontend dependencies
node_modules/
# Frontend dependencies
node_modules/

# Local notes
.notes
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ ENV PATH="${PATH}:/opt/poetry/bin"
# Install dependencies and start app
WORKDIR /app
COPY pyproject.toml poetry.lock ./
RUN poetry install --only main
RUN poetry install --without dev
COPY . .
CMD ["/bin/sh", "start_local.sh"]
30 changes: 30 additions & 0 deletions alembic/versions/851709d3a162_add_report_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Add report column
Revision ID: 851709d3a162
Revises: c5596492c87b
Create Date: 2024-02-06 13:22:25.266733
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.types import JSON


# revision identifiers, used by Alembic.
revision: str = "851709d3a162"
down_revision: Union[str, None] = "c5596492c87b"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade():
op.add_column(
"data_submissions",
sa.Column("report", JSON, nullable=True),
)


def add_column():
op.drop_column("data_submissions", "report")
68 changes: 67 additions & 1 deletion nad_ch/application/dtos.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,73 @@
from dataclasses import dataclass
from dataclasses import dataclass, asdict, field, is_dataclass
from typing import List
import numpy as np


@dataclass
class DownloadResult:
temp_dir: str
extracted_dir: str


@dataclass
class DataSubmissionReportOverview:
feature_count: int = 0
features_flagged: int = 0
etl_update_required: bool = False
data_update_required: bool = False


@dataclass
class DataSubmissionReportFeature:
provided_feature_name: str
nad_feature_name: str
populated_count: int
null_count: int


@dataclass
class DataSubmissionReport:
overview: DataSubmissionReportOverview
features: List[DataSubmissionReportFeature] = field(default_factory=list)


def report_to_dict(data_submission_report: DataSubmissionReport) -> dict:
"""
Converts a DataSubmissionReport instance into a dictionary because all data types
within the dictionary must be JSON-serializable.
"""
return convert(asdict(data_submission_report))


def report_from_dict(data: dict) -> DataSubmissionReport:
"""
Creates a DataSubmissionReport instance from a dictionary, reconstructing the
overview and features properties from their respective dictionary representations.
"""

overview_data = data.get("overview", {})
features_data = data.get("features", [])

overview = DataSubmissionReportOverview(**overview_data)
features = [
DataSubmissionReportFeature(**feature_data) for feature_data in features_data
]

return DataSubmissionReport(overview=overview, features=features)


def convert(item):
"""
Recursively converts items within a data structure (including dictionaries, lists,
and dataclass instances) such that all numeric types are JSON-serializable.
"""
if isinstance(item, dict):
return {k: convert(v) for k, v in item.items()}
elif isinstance(item, list):
return [convert(i) for i in item]
elif isinstance(item, (np.int64, np.int32, np.float64, np.float32)):
return int(item) if isinstance(item, (np.int64, np.int32)) else float(item)
elif is_dataclass(item):
return convert(asdict(item))
else:
return item
4 changes: 3 additions & 1 deletion nad_ch/application/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ def cleanup_temp_dir(self, temp_dir: str) -> bool:


class TaskQueue(Protocol):
def run_load_and_validate(self, path: str):
def run_load_and_validate(
self, submissions: DataSubmissionRepository, submission_id: int, path: str
):
...


Expand Down
6 changes: 4 additions & 2 deletions nad_ch/application/use_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ def validate_data_submission(ctx: ApplicationContext, filename: str):
ctx.logger.error("Data extration error")
return

result = ctx.task_queue.run_load_and_validate(download_result.extracted_dir)
report = ctx.task_queue.run_load_and_validate(
ctx.submissions, submission.id, download_result.extracted_dir
)

ctx.logger.info(f"Total number of features: {result.get()}")
ctx.logger.info(f"Total number of features: {report.overview.feature_count}")

ctx.storage.cleanup_temp_dir(download_result.temp_dir)
21 changes: 21 additions & 0 deletions nad_ch/application/validation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,26 @@
from typing import List
from geopandas import GeoDataFrame
from nad_ch.application.dtos import DataSubmissionReportFeature


def get_feature_count(gdf: GeoDataFrame) -> int:
return len(gdf)


def get_feature_details(gdf: GeoDataFrame) -> List[DataSubmissionReportFeature]:
report_features = []

for column in gdf.columns:
populated_count = gdf[column].notna().sum()
null_count = gdf[column].isna().sum()

report_feature = DataSubmissionReportFeature(
provided_feature_name=column,
nad_feature_name=column,
populated_count=populated_count,
null_count=null_count,
)

report_features.append(report_feature)

return report_features
6 changes: 3 additions & 3 deletions nad_ch/config/development_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,18 @@

class DevLocalApplicationContext(ApplicationContext):
def __init__(self):
self._session = create_session_factory(DATABASE_URL)
self._session_factory = create_session_factory(DATABASE_URL)
self._providers = self.create_provider_repository()
self._submissions = self.create_submission_repository()
self._logger = self.create_logger()
self._storage = self.create_storage()
self._task_queue = self.create_task_queue()

def create_provider_repository(self):
return SqlAlchemyDataProviderRepository(self._session)
return SqlAlchemyDataProviderRepository(self._session_factory)

def create_submission_repository(self):
return SqlAlchemyDataSubmissionRepository(self._session)
return SqlAlchemyDataSubmissionRepository(self._session_factory)

def create_logger(self):
return BasicLogger(__name__, logging.DEBUG)
Expand Down
6 changes: 3 additions & 3 deletions nad_ch/config/development_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,18 @@ def get_credentials(service_name, default={}):

class DevRemoteApplicationContext(ApplicationContext):
def __init__(self):
self._session = create_session_factory(DATABASE_URL)
self._session_factory = create_session_factory(DATABASE_URL)
self._providers = self.create_provider_repository()
self._submissions = self.create_submission_repository()
self._logger = self.create_logger()
self._storage = self.create_storage()
self._task_queue = self.create_task_queue()

def create_provider_repository(self):
return SqlAlchemyDataProviderRepository(self._session)
return SqlAlchemyDataProviderRepository(self._session_factory)

def create_submission_repository(self):
return SqlAlchemyDataSubmissionRepository(self._session)
return SqlAlchemyDataSubmissionRepository(self._session_factory)

def create_logger(self):
return BasicLogger(__name__)
Expand Down
5 changes: 5 additions & 0 deletions nad_ch/domain/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@ def __init__(
self,
filename: str,
provider: DataProvider,
report=None,
id: int = None,
):
super().__init__(id)
self.filename = filename
self.provider = provider
self.report = report

def __repr__(self):
return f"DataSubmission \
Expand All @@ -55,3 +57,6 @@ def generate_filename(file_path: str, provider: DataProvider) -> str:
_, file_extension = os.path.splitext(file_path)
filename = f"{formatted_provider_name}_{datetime_str}{file_extension}"
return filename

def has_report(self) -> bool:
return self.report is not None
3 changes: 3 additions & 0 deletions nad_ch/domain/repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ def get_by_provider(self, provider: DataProvider) -> Iterable[DataSubmission]:

def get_by_filename() -> Optional[DataSubmission]:
...

def update_report(self, submission_id: int, report) -> None:
...
42 changes: 29 additions & 13 deletions nad_ch/infrastructure/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from sqlalchemy import Column, Integer, String, create_engine, ForeignKey, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base, relationship, Session
from sqlalchemy.sql import func
from sqlalchemy.types import JSON
import contextlib
from nad_ch.domain.entities import DataProvider, DataSubmission
from nad_ch.domain.repositories import DataProviderRepository, DataSubmissionRepository
Expand All @@ -16,7 +17,7 @@ def create_session_factory(connection_string: str):

@contextlib.contextmanager
def session_scope(session_factory):
session = session_factory
session = session_factory()
try:
yield session
session.commit()
Expand Down Expand Up @@ -76,6 +77,7 @@ class DataSubmissionModel(CommonBase):

filename = Column(String)
data_provider_id = Column(Integer, ForeignKey("data_providers.id"))
report = Column(JSON)

data_provider = relationship("DataProviderModel", back_populates="data_submissions")

Expand All @@ -84,12 +86,15 @@ def from_entity(submission):
model = DataSubmissionModel(
id=submission.id,
filename=submission.filename,
report=submission.report,
data_provider_id=submission.provider.id,
)
return model

def to_entity(self, provider: DataProvider):
entity = DataSubmission(id=self.id, filename=self.filename, provider=provider)
entity = DataSubmission(
id=self.id, filename=self.filename, report=self.report, provider=provider
)

if self.created_at is not None:
entity.set_created_at(self.created_at)
Expand All @@ -101,19 +106,19 @@ def to_entity(self, provider: DataProvider):


class SqlAlchemyDataProviderRepository(DataProviderRepository):
def __init__(self, session: Session):
self.session_factory = session
def __init__(self, session_factory):
self.session_factory = session_factory

def add(self, provider: DataProvider) -> DataProvider:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
provider_model = DataProviderModel.from_entity(provider)
session.add(provider_model)
session.commit()
session.refresh(provider_model)
return provider_model.to_entity()

def get_by_name(self, name: str) -> Optional[DataProvider]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
provider_model = (
session.query(DataProviderModel)
.filter(DataProviderModel.name == name)
Expand All @@ -122,18 +127,18 @@ def get_by_name(self, name: str) -> Optional[DataProvider]:
return provider_model.to_entity() if provider_model else None

def get_all(self) -> List[DataProvider]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
provider_models = session.query(DataProviderModel).all()
providers_entities = [provider.to_entity() for provider in provider_models]
return providers_entities


class SqlAlchemyDataSubmissionRepository(DataSubmissionRepository):
def __init__(self, session: Session):
self.session_factory = session
def __init__(self, session_factory):
self.session_factory = session_factory

def add(self, submission: DataSubmission) -> DataSubmission:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
submission_model = DataSubmissionModel.from_entity(submission)
session.add(submission_model)
session.commit()
Expand All @@ -146,7 +151,7 @@ def add(self, submission: DataSubmission) -> DataSubmission:
return submission_model.to_entity(provider_model.to_entity())

def get_by_id(self, id: int) -> Optional[DataSubmission]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
result = (
session.query(DataSubmissionModel, DataProviderModel)
.join(
Expand All @@ -164,7 +169,7 @@ def get_by_id(self, id: int) -> Optional[DataSubmission]:
return None

def get_by_provider(self, provider: DataProvider) -> List[DataSubmission]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
submission_models = (
session.query(DataSubmissionModel)
.filter(DataSubmissionModel.data_provider_id == provider.id)
Expand All @@ -176,7 +181,7 @@ def get_by_provider(self, provider: DataProvider) -> List[DataSubmission]:
return submission_entities

def get_by_filename(self, filename: str) -> Optional[DataSubmission]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
result = (
session.query(DataSubmissionModel, DataProviderModel)
.join(
Expand All @@ -192,3 +197,14 @@ def get_by_filename(self, filename: str) -> Optional[DataSubmission]:
return submission_model.to_entity(provider_model.to_entity())
else:
return None

def update_report(self, id: int, report) -> None:
with session_scope(self.session_factory) as session:
model_instance = (
session.query(DataSubmissionModel)
.filter(DataSubmissionModel.id == id)
.first()
)

if model_instance:
model_instance.report = report
Loading

0 comments on commit bba1eb3

Please sign in to comment.