From c9e44b2162344e4ee0d0e027be2042ec8b0584d2 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Sat, 6 Apr 2024 12:05:56 +0200 Subject: [PATCH 01/57] Include collection stats in `DatasetURLPage` --- datalad_registry/blueprints/api/dataset_urls/models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index ca000424..20d1a4b0 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -256,6 +256,10 @@ class Config: by_alias = False +class CollectionStats(BaseModel): + pass + + class DatasetURLPage(BaseModel): """ Model for representing a page of dataset URLs in response communication @@ -275,3 +279,8 @@ class DatasetURLPage(BaseModel): dataset_urls: list[DatasetURLRespModel] = Field( description="The list of dataset URLs in the current page" ) + collection_stats: CollectionStats = Field( + description="Statistics about the collection of dataset URLs, " + "not just the URLs in the current page but the entire collection " + "returned" + ) From 6850ec6445e620d7dd9d6d4556df6650fe5d1f4b Mon Sep 17 00:00:00 2001 From: Isaac To Date: Sat, 6 Apr 2024 15:47:54 +0200 Subject: [PATCH 02/57] Define models for representing stats of filter collections --- .../blueprints/api/dataset_urls/models.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 20d1a4b0..2ac6bcbd 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -9,6 +9,7 @@ BaseModel, Field, FileUrl, + NonNegativeInt, PositiveInt, StrictInt, StrictStr, @@ -256,10 +257,54 @@ class Config: by_alias = False -class CollectionStats(BaseModel): +class _AnnexDsCollectionStats(BaseModel): + """ + Model with the base 
components of annex dataset collection statistics + """ + + ds_count: NonNegativeInt = Field(description="The number of datasets") + size_of_annexed_files: NonNegativeInt = Field( + description="The size of annexed files" + ) + annexed_file_count: NonNegativeInt = Field( + description="The number of annexed files" + ) + + +class AnnexDsCollectionStats(BaseModel): + """ + Model for annex dataset collection statistics + """ + + unique_ds_stats: _AnnexDsCollectionStats = Field( + description="Statistics for unique datasets" + ) + stats: _AnnexDsCollectionStats = Field(description="Statistics for all datasets") + + +class NonAnnexDsStats(BaseModel): pass +class StatsSummary(BaseModel): + unique_ds_count: NonNegativeInt = Field(description="The number of unique datasets") + ds_count: NonNegativeInt = Field(description="The number of datasets") + + +class CollectionStats(BaseModel): + datalad_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for DataLad datasets" + ) + pure_annex_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for pure annex datasets" + ) + non_annex_ds_stats: NonAnnexDsStats = Field( + description="Statistics for non-annex datasets" + ) + + summary: StatsSummary = Field(description="Summary statistics") + + class DatasetURLPage(BaseModel): """ Model for representing a page of dataset URLs in response communication From 1c84c51b25d52b5fab496c5c73f475827ea75986 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 9 Apr 2024 18:10:37 -0700 Subject: [PATCH 03/57] Extract base select statement So that it can be used to calculate statistics --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 85005609..9adbfc85 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ 
b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -285,11 +285,11 @@ def cache_path_trans(cache_path: Path) -> str: ep = ".dataset_urls" # Endpoint of `dataset_urls` base_qry = loads(query.json(exclude={"page"}, exclude_none=True)) + base_select = db.select(RepoUrl).filter(and_(True, *constraints)) + max_per_page = 100 # The overriding limit to `per_page` provided by the requester pagination = db.paginate( - db.select(RepoUrl) - .filter(and_(True, *constraints)) - .order_by( + base_select.order_by( getattr( _ORDER_KEY_TO_SQLA_ATTR[query.order_by], query.order_dir.value )().nulls_last() From 11954acf8d935f75c3f06cd620d57f80ee334c05 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 9 Apr 2024 18:18:20 -0700 Subject: [PATCH 04/57] Replace `db.select` with `sqlalchemy.select` They are essentially the same object but `sqlalchemy.select` comes with better typing support --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 9adbfc85..acba9458 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -9,7 +9,7 @@ from flask_openapi3 import APIBlueprint, Tag from lark.exceptions import GrammarError, UnexpectedInput from psycopg2.errors import UniqueViolation -from sqlalchemy import ColumnElement, and_ +from sqlalchemy import ColumnElement, and_, select from sqlalchemy.exc import IntegrityError from datalad_registry.models import RepoUrl, db @@ -92,7 +92,7 @@ def declare_dataset_url(body: DatasetURLSubmitModel): url_as_str = str(body.url) repo_url_row = db.session.execute( - db.select(RepoUrl).filter_by(url=url_as_str) + select(RepoUrl).filter_by(url=url_as_str) ).one_or_none() if repo_url_row is None: # == The URL requested to be created does not exist in the database == @@ -116,7 
+116,7 @@ def declare_dataset_url(body: DatasetURLSubmitModel): # of the URL in the database db.session.rollback() repo_url_added_by_another = ( - db.session.execute(db.select(RepoUrl).filter_by(url=url_as_str)) + db.session.execute(select(RepoUrl).filter_by(url=url_as_str)) .scalars() .one_or_none() ) @@ -285,7 +285,7 @@ def cache_path_trans(cache_path: Path) -> str: ep = ".dataset_urls" # Endpoint of `dataset_urls` base_qry = loads(query.json(exclude={"page"}, exclude_none=True)) - base_select = db.select(RepoUrl).filter(and_(True, *constraints)) + base_select = select(RepoUrl).filter(and_(True, *constraints)) max_per_page = 100 # The overriding limit to `per_page` provided by the requester pagination = db.paginate( From 28d1d08469dcded189d3982d0596a1462e0510bc Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 9 Apr 2024 18:39:50 -0700 Subject: [PATCH 05/57] Rename `base_select` to `base_select_stmt` For consistency --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index acba9458..0f16dfb3 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -285,11 +285,11 @@ def cache_path_trans(cache_path: Path) -> str: ep = ".dataset_urls" # Endpoint of `dataset_urls` base_qry = loads(query.json(exclude={"page"}, exclude_none=True)) - base_select = select(RepoUrl).filter(and_(True, *constraints)) + base_select_stmt = select(RepoUrl).filter(and_(True, *constraints)) max_per_page = 100 # The overriding limit to `per_page` provided by the requester pagination = db.paginate( - base_select.order_by( + base_select_stmt.order_by( getattr( _ORDER_KEY_TO_SQLA_ATTR[query.order_by], query.order_dir.value )().nulls_last() From 2a1859d80054e410531dac116d9faa072ee6524e Mon Sep 17 00:00:00 2001 From: Isaac To Date: 
Tue, 9 Apr 2024 18:51:39 -0700 Subject: [PATCH 06/57] Define func for collection stats and utilize it --- .../blueprints/api/dataset_urls/__init__.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 0f16dfb3..e0dcb0fe 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -9,7 +9,7 @@ from flask_openapi3 import APIBlueprint, Tag from lark.exceptions import GrammarError, UnexpectedInput from psycopg2.errors import UniqueViolation -from sqlalchemy import ColumnElement, and_, select +from sqlalchemy import ColumnElement, Select, and_, select from sqlalchemy.exc import IntegrityError from datalad_registry.models import RepoUrl, db @@ -23,6 +23,7 @@ from datalad_registry.utils.flask_tools import json_resp_from_str from .models import ( + CollectionStats, DatasetURLPage, DatasetURLRespBaseModel, DatasetURLRespModel, @@ -179,6 +180,17 @@ def declare_dataset_url(body: DatasetURLSubmitModel): return json_resp_from_str(resp_model, status=202) +def get_collection_stats(select_stmt: Select) -> CollectionStats: + """ + Get the statistics of the collection of dataset URLs specified by the given select + statement + + :param select_stmt: The given select statement + :return: The statistics of the collection of dataset URLs + """ + pass + + @bp.get("", responses={"200": DatasetURLPage, "400": HTTPExceptionResp}) def dataset_urls(query: QueryParams): """ @@ -356,6 +368,7 @@ def cache_path_trans(cache_path: Path) -> str: first_pg=url_for(ep, **base_qry, page=1), last_pg=url_for(ep, **base_qry, page=1 if total_pages == 0 else total_pages), dataset_urls=ds_urls, + collection_stats=get_collection_stats(base_select_stmt), ) return json_resp_from_str(page.json(exclude_none=True)) From 412ce63e1b83b6724e1afb8204ed6a339ba30b93 Mon Sep 17 00:00:00 2001 From: 
Isaac To Date: Wed, 10 Apr 2024 13:39:41 -0700 Subject: [PATCH 07/57] Rename `AnnexDsCollectionStats` to `DataladDsCollectionStats` This model is more appropriate for representing the stats of DataLad dataset collection --- datalad_registry/blueprints/api/dataset_urls/models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 2ac6bcbd..ef26e3a1 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -271,9 +271,9 @@ class _AnnexDsCollectionStats(BaseModel): ) -class AnnexDsCollectionStats(BaseModel): +class DataladDsCollectionStats(BaseModel): """ - Model for annex dataset collection statistics + Model for DataLad dataset collection statistics """ unique_ds_stats: _AnnexDsCollectionStats = Field( @@ -292,10 +292,10 @@ class StatsSummary(BaseModel): class CollectionStats(BaseModel): - datalad_ds_stats: AnnexDsCollectionStats = Field( + datalad_ds_stats: DataladDsCollectionStats = Field( description="Statistics for DataLad datasets" ) - pure_annex_ds_stats: AnnexDsCollectionStats = Field( + pure_annex_ds_stats: DataladDsCollectionStats = Field( description="Statistics for pure annex datasets" ) non_annex_ds_stats: NonAnnexDsStats = Field( From 4eabb09f1d6d74dba62c3c23f9c841ed8879902a Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 13:41:35 -0700 Subject: [PATCH 08/57] Rename `_AnnexDsCollectionStats` to `AnnexDsCollectionStats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index ef26e3a1..17f852a6 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -257,7 +257,7 @@ 
class Config: by_alias = False -class _AnnexDsCollectionStats(BaseModel): +class AnnexDsCollectionStats(BaseModel): """ Model with the base components of annex dataset collection statistics """ @@ -276,10 +276,10 @@ class DataladDsCollectionStats(BaseModel): Model for DataLad dataset collection statistics """ - unique_ds_stats: _AnnexDsCollectionStats = Field( + unique_ds_stats: AnnexDsCollectionStats = Field( description="Statistics for unique datasets" ) - stats: _AnnexDsCollectionStats = Field(description="Statistics for all datasets") + stats: AnnexDsCollectionStats = Field(description="Statistics for all datasets") class NonAnnexDsStats(BaseModel): From 77b81b1c784828430912eb54c21672f40f138b21 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 13:58:47 -0700 Subject: [PATCH 09/57] Update description `DataladDsCollectionStats.stats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 17f852a6..59344f60 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -279,7 +279,10 @@ class DataladDsCollectionStats(BaseModel): unique_ds_stats: AnnexDsCollectionStats = Field( description="Statistics for unique datasets" ) - stats: AnnexDsCollectionStats = Field(description="Statistics for all datasets") + stats: AnnexDsCollectionStats = Field( + description="Statistics for all datasets, as individual repos, " + "without any deduplication" + ) class NonAnnexDsStats(BaseModel): From 127e9554f40109e7149a3c5ae78ad5437cc70814 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:03:54 -0700 Subject: [PATCH 10/57] Modify `CollectionStats.pure_annex_ds_stats` with the data model that represents it and its description --- datalad_registry/blueprints/api/dataset_urls/models.py | 5 +++-- 1 
file changed, 3 insertions(+), 2 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 59344f60..59bbe59e 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -298,8 +298,9 @@ class CollectionStats(BaseModel): datalad_ds_stats: DataladDsCollectionStats = Field( description="Statistics for DataLad datasets" ) - pure_annex_ds_stats: DataladDsCollectionStats = Field( - description="Statistics for pure annex datasets" + pure_annex_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for pure annex datasets, as individual repos, " + "without any deduplication" ) non_annex_ds_stats: NonAnnexDsStats = Field( description="Statistics for non-annex datasets" From bf70c4ec48f9048a401cc04d51ab4af9db24152c Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:06:58 -0700 Subject: [PATCH 11/57] Rename `NonAnnexDsStats` to `NonAnnexDsCollectionStats` to achieve consistency of naming --- datalad_registry/blueprints/api/dataset_urls/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 59bbe59e..8aba7ebc 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -285,7 +285,7 @@ class DataladDsCollectionStats(BaseModel): ) -class NonAnnexDsStats(BaseModel): +class NonAnnexDsCollectionStats(BaseModel): pass @@ -302,7 +302,7 @@ class CollectionStats(BaseModel): description="Statistics for pure annex datasets, as individual repos, " "without any deduplication" ) - non_annex_ds_stats: NonAnnexDsStats = Field( + non_annex_ds_stats: NonAnnexDsCollectionStats = Field( description="Statistics for non-annex datasets" ) From 549239348adadf10b88646c7b417823b10f88670 Mon Sep 17 00:00:00 
2001 From: Isaac To Date: Wed, 10 Apr 2024 14:14:51 -0700 Subject: [PATCH 12/57] Improve description for `StatsSummary.ds_count` --- datalad_registry/blueprints/api/dataset_urls/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 8aba7ebc..45d46d98 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -291,7 +291,10 @@ class NonAnnexDsCollectionStats(BaseModel): class StatsSummary(BaseModel): unique_ds_count: NonNegativeInt = Field(description="The number of unique datasets") - ds_count: NonNegativeInt = Field(description="The number of datasets") + ds_count: NonNegativeInt = Field( + description="The number of datasets, as individual repos, " + "without any deduplication" + ) class CollectionStats(BaseModel): From e1e720ca2c6b2f832e630ddce4ba60272f04664a Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:20:40 -0700 Subject: [PATCH 13/57] Define `NonAnnexDsCollectionStats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 45d46d98..1e303a8d 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -286,7 +286,14 @@ class DataladDsCollectionStats(BaseModel): class NonAnnexDsCollectionStats(BaseModel): - pass + """ + Model for non-annex dataset collection statistics + """ + + ds_count: NonNegativeInt = Field( + description="The number of datasets, as individual repos, " + "without any deduplication" + ) class StatsSummary(BaseModel): From 01a90c5e3fa69afc8ecbc85540c2d89d6ecb14fa Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:44:25 -0700 
Subject: [PATCH 14/57] Improve description of `CollectionStats.non_annex_ds_stats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 1e303a8d..60e33ccd 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -313,7 +313,8 @@ class CollectionStats(BaseModel): "without any deduplication" ) non_annex_ds_stats: NonAnnexDsCollectionStats = Field( - description="Statistics for non-annex datasets" + description="Statistics for non-annex datasets, as individual repos, " + "without any deduplication" ) summary: StatsSummary = Field(description="Summary statistics") From 15690ad86ab39e5d8338a61ec69fd2d02a2fbea6 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 11 Apr 2024 11:08:00 -0700 Subject: [PATCH 15/57] Improve doc-string of `get_collection_stats` --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index e0dcb0fe..0361d820 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -187,6 +187,8 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: :param select_stmt: The given select statement :return: The statistics of the collection of dataset URLs + + Note: The execution of this function requires the Flask app's context """ pass From 619cd36f5a98f30b8d69686db3a123070f4d541b Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 11 Apr 2024 13:52:21 -0700 Subject: [PATCH 16/57] Rename variable --- datalad_registry/blueprints/api/dataset_urls/models.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 60e33ccd..b3dd48c7 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -263,9 +263,7 @@ class AnnexDsCollectionStats(BaseModel): """ ds_count: NonNegativeInt = Field(description="The number of datasets") - size_of_annexed_files: NonNegativeInt = Field( - description="The size of annexed files" - ) + annexed_files_size: NonNegativeInt = Field(description="The size of annexed files") annexed_file_count: NonNegativeInt = Field( description="The number of annexed files" ) From 91b07a702c66fbcdd5ff43b6821a327fcd641bcb Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 11 Apr 2024 19:07:32 -0700 Subject: [PATCH 17/57] Define all supporting funcs in a `tools.py` --- .../blueprints/api/dataset_urls/__init__.py | 17 +- .../blueprints/api/dataset_urls/tools.py | 180 ++++++++++++++++++ 2 files changed, 182 insertions(+), 15 deletions(-) create mode 100644 datalad_registry/blueprints/api/dataset_urls/tools.py diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 0361d820..a3efe8a0 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -9,7 +9,7 @@ from flask_openapi3 import APIBlueprint, Tag from lark.exceptions import GrammarError, UnexpectedInput from psycopg2.errors import UniqueViolation -from sqlalchemy import ColumnElement, Select, and_, select +from sqlalchemy import ColumnElement, and_, select from sqlalchemy.exc import IntegrityError from datalad_registry.models import RepoUrl, db @@ -23,7 +23,6 @@ from datalad_registry.utils.flask_tools import json_resp_from_str from .models import ( - CollectionStats, DatasetURLPage, DatasetURLRespBaseModel, DatasetURLRespModel, @@ -33,6 +32,7 @@ PathParams, QueryParams, ) 
+from .tools import get_collection_stats from .. import ( API_URL_PREFIX, COMMON_API_RESPONSES, @@ -180,19 +180,6 @@ def declare_dataset_url(body: DatasetURLSubmitModel): return json_resp_from_str(resp_model, status=202) -def get_collection_stats(select_stmt: Select) -> CollectionStats: - """ - Get the statistics of the collection of dataset URLs specified by the given select - statement - - :param select_stmt: The given select statement - :return: The statistics of the collection of dataset URLs - - Note: The execution of this function requires the Flask app's context - """ - pass - - @bp.get("", responses={"200": DatasetURLPage, "400": HTTPExceptionResp}) def dataset_urls(query: QueryParams): """ diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py new file mode 100644 index 00000000..7fd73fbb --- /dev/null +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -0,0 +1,180 @@ +from sqlalchemy import Select, Subquery, and_, func, or_, select + +from datalad_registry.models import RepoUrl, db + +from .models import ( + AnnexDsCollectionStats, + CollectionStats, + DataladDsCollectionStats, + NonAnnexDsCollectionStats, + StatsSummary, +) + + +def _get_annex_ds_collection_stats(q: Subquery) -> AnnexDsCollectionStats: + """ + Get the stats of a collection of datasets that contains only of annex datasets + + :param q: The query that specifies the collection of datasets under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + ds_count, annexed_files_size, annexed_file_count = db.session.execute( + select( + func.count().label("ds_count"), + func.sum(q.c.annexed_files_in_wt_size).label("annexed_files_size"), + func.sum(q.c.annexed_files_in_wt_count).label("annexed_file_count"), + ).select_from(q) + ).one() + + return AnnexDsCollectionStats( + ds_count=ds_count, + annexed_files_size=annexed_files_size, + 
annexed_file_count=annexed_file_count, + ) + + +def get_unique_dl_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of Datalad datasets, considering datasets with the same `ds_id` as the same + dataset + + :param base_q: The base query that specified the collection of datasets + under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + grp_by_id_q = ( + select( + base_q.c.ds_id, + func.max(base_q.c.annexed_files_in_wt_size).label( + "max_annexed_files_in_wt_size" + ), + ) + .group_by(base_q.c.ds_id) + .subquery("grp_by_id_q") + ) + + grp_by_id_and_a_f_size_q = ( + select( + RepoUrl.ds_id, + RepoUrl.annexed_files_in_wt_size, + func.max(RepoUrl.annexed_files_in_wt_count).label( + "annexed_files_in_wt_count" + ), + ) + .join( + grp_by_id_q, + and_( + RepoUrl.ds_id == grp_by_id_q.c.ds_id, + or_( + grp_by_id_q.c.max_annexed_files_in_wt_size.is_(None), + RepoUrl.annexed_files_in_wt_size + == grp_by_id_q.c.max_annexed_files_in_wt_size, + ), + ), + ) + .group_by(RepoUrl.ds_id, RepoUrl.annexed_files_in_wt_size) + .subquery("grp_by_id_and_a_f_size_q") + ) + + return _get_annex_ds_collection_stats(grp_by_id_and_a_f_size_q) + + +def get_dl_ds_collection_stats_with_dups(base_q: Subquery) -> AnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of Datalad datasets, considering individual repos as a dataset regardless of + the value of `ds_id`. 
+ + :param base_q: The base query that specified the collection of datasets + under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + # Select statement for getting all the Datalad datasets + dl_ds_q = select(base_q).filter(base_q.c.ds_id.is_not(None)).subquery("dl_ds_q") + + return _get_annex_ds_collection_stats(dl_ds_q) + + +def get_dl_ds_collection_stats(base_q: Subquery) -> DataladDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of Datalad datasets + + :param base_q: The base query that specified the collection of datasets + under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + return DataladDsCollectionStats( + unique_ds_stats=get_unique_dl_ds_collection_stats(base_q), + stats=get_dl_ds_collection_stats_with_dups(base_q), + ) + + +def get_pure_annex_ds_collection_stats() -> AnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of pure annex datasets, the annex datasets that are not Datalad datasets + + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + # Select statement for getting all the pure annex datasets + pass + + +def get_non_annex_ds_collection_stats() -> NonAnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of non-annex datasets + + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + pass + + +def get_collection_stats(select_stmt: Select) -> CollectionStats: + """ + Get the statistics of the collection of dataset URLs specified by the given select + statement + + :param select_stmt: The given select statement + :return: The statistics of the 
collection of dataset URLs + + Note: The execution of this function requires the Flask app's context + """ + + base_q = select_stmt.subquery("base_q") + + datalad_ds_stats = get_dl_ds_collection_stats(base_q) + pure_annex_ds_stats = get_pure_annex_ds_collection_stats() + non_annex_ds_stats = get_non_annex_ds_collection_stats() + + # Total number of datasets, as individual repos, without any deduplication + ds_count = db.session.execute( + select(func.count().label("ds_count")).select_from(base_q) + ).scalar_one() + + return CollectionStats( + datalad_ds_stats=datalad_ds_stats, + pure_annex_ds_stats=pure_annex_ds_stats, + non_annex_ds_stats=non_annex_ds_stats, + summary=StatsSummary( + unique_ds_count=datalad_ds_stats.unique_ds_stats.ds_count, ds_count=ds_count + ), + ) From e8f28ea1d96e989b0d4bcf6c0bce61f82fcaedc8 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 16 Apr 2024 18:21:28 -0700 Subject: [PATCH 18/57] Implement getting stats for pure annex ds --- .../blueprints/api/dataset_urls/tools.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py index 7fd73fbb..f59e968e 100644 --- a/datalad_registry/blueprints/api/dataset_urls/tools.py +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -123,17 +123,25 @@ def get_dl_ds_collection_stats(base_q: Subquery) -> DataladDsCollectionStats: ) -def get_pure_annex_ds_collection_stats() -> AnnexDsCollectionStats: +def get_pure_annex_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionStats: """ Get the stats of the subset of the collection of datasets that contains only of pure annex datasets, the annex datasets that are not Datalad datasets + :param base_q: The base query that specified the collection of datasets + under consideration :return: The object representing the stats Note: The execution of this function requires the Flask app's context """ # Select statement 
for getting all the pure annex datasets - pass + pure_annex_ds_q = ( + select(base_q) + .filter(and_(base_q.c.branches.has_key("git-annex"), base_q.c.ds_id.is_(None))) + .subquery("pure_annex_ds_q") + ) + + return _get_annex_ds_collection_stats(pure_annex_ds_q) def get_non_annex_ds_collection_stats() -> NonAnnexDsCollectionStats: @@ -162,7 +170,7 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: base_q = select_stmt.subquery("base_q") datalad_ds_stats = get_dl_ds_collection_stats(base_q) - pure_annex_ds_stats = get_pure_annex_ds_collection_stats() + pure_annex_ds_stats = get_pure_annex_ds_collection_stats(base_q) non_annex_ds_stats = get_non_annex_ds_collection_stats() # Total number of datasets, as individual repos, without any deduplication From 83d71384a06ed20b30809371928cf92b03e270a5 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 12:25:44 -0700 Subject: [PATCH 19/57] Implement the gathering of stats for non-annex ds --- .../blueprints/api/dataset_urls/tools.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py index f59e968e..594d018b 100644 --- a/datalad_registry/blueprints/api/dataset_urls/tools.py +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -1,4 +1,4 @@ -from sqlalchemy import Select, Subquery, and_, func, or_, select +from sqlalchemy import Select, Subquery, and_, func, not_, or_, select from datalad_registry.models import RepoUrl, db @@ -144,16 +144,29 @@ def get_pure_annex_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionSta return _get_annex_ds_collection_stats(pure_annex_ds_q) -def get_non_annex_ds_collection_stats() -> NonAnnexDsCollectionStats: +def get_non_annex_ds_collection_stats(base_q: Subquery) -> NonAnnexDsCollectionStats: """ Get the stats of the subset of the collection of datasets that contains only of non-annex datasets + :param 
base_q: The base query that specified the collection of datasets + under consideration :return: The object representing the stats Note: The execution of this function requires the Flask app's context """ - pass + # Select statement for getting all the non-annex datasets + non_annex_ds_q = ( + select(base_q) + .filter(not_(base_q.c.branches.has_key("git-annex"))) + .subquery("non_annex_ds_q") + ) + + return NonAnnexDsCollectionStats( + ds_count=db.session.execute( + select(func.count().label("ds_count")).select_from(non_annex_ds_q) + ).scalar_one() + ) def get_collection_stats(select_stmt: Select) -> CollectionStats: @@ -171,7 +184,7 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: datalad_ds_stats = get_dl_ds_collection_stats(base_q) pure_annex_ds_stats = get_pure_annex_ds_collection_stats(base_q) - non_annex_ds_stats = get_non_annex_ds_collection_stats() + non_annex_ds_stats = get_non_annex_ds_collection_stats(base_q) # Total number of datasets, as individual repos, without any deduplication ds_count = db.session.execute( From 55728ffdd4b160634668c7b58dd8a91bfe27c737 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 12:31:15 -0700 Subject: [PATCH 20/57] Format: Inline function calls --- datalad_registry/blueprints/api/dataset_urls/tools.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py index 594d018b..8c1a8b55 100644 --- a/datalad_registry/blueprints/api/dataset_urls/tools.py +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -183,8 +183,6 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: base_q = select_stmt.subquery("base_q") datalad_ds_stats = get_dl_ds_collection_stats(base_q) - pure_annex_ds_stats = get_pure_annex_ds_collection_stats(base_q) - non_annex_ds_stats = get_non_annex_ds_collection_stats(base_q) # Total number of datasets, as individual repos, without 
any deduplication ds_count = db.session.execute( @@ -193,8 +191,8 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: return CollectionStats( datalad_ds_stats=datalad_ds_stats, - pure_annex_ds_stats=pure_annex_ds_stats, - non_annex_ds_stats=non_annex_ds_stats, + pure_annex_ds_stats=get_pure_annex_ds_collection_stats(base_q), + non_annex_ds_stats=get_non_annex_ds_collection_stats(base_q), summary=StatsSummary( unique_ds_count=datalad_ds_stats.unique_ds_stats.ds_count, ds_count=ds_count ), From 926d8352a5d8d0218756f9de202cb781cb5236bc Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 14:10:06 -0700 Subject: [PATCH 21/57] Make `annexed_files_size` and `annexed_file_count` optional In the case of all rows have the column `annexed_files_in_wt_count` or `annexed_files_in_wt_size` be `null`, `annexed_files_size` and `annexed_file_count` can be assigned to `None` respectively --- datalad_registry/blueprints/api/dataset_urls/models.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index b3dd48c7..65ff0a65 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -263,9 +263,11 @@ class AnnexDsCollectionStats(BaseModel): """ ds_count: NonNegativeInt = Field(description="The number of datasets") - annexed_files_size: NonNegativeInt = Field(description="The size of annexed files") - annexed_file_count: NonNegativeInt = Field( - description="The number of annexed files" + annexed_files_size: Optional[NonNegativeInt] = Field( + None, description="The size of annexed files" + ) + annexed_file_count: Optional[NonNegativeInt] = Field( + None, description="The number of annexed files" ) From 6cb335d8258f5daed64e157755f37bba47c7f141 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 18:45:43 -0700 Subject: [PATCH 22/57] Provide dummy 
`CollectionStats` in tests of `registry-get-urls` client --- .../tests/test_get_urls.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/datalad_registry_client/tests/test_get_urls.py b/datalad_registry_client/tests/test_get_urls.py index 5dc3e771..bc3f7bae 100644 --- a/datalad_registry_client/tests/test_get_urls.py +++ b/datalad_registry_client/tests/test_get_urls.py @@ -9,8 +9,13 @@ from yarl import URL from datalad_registry.blueprints.api.dataset_urls.models import ( + AnnexDsCollectionStats, + CollectionStats, + DataladDsCollectionStats, DatasetURLPage, DatasetURLRespModel, + NonAnnexDsCollectionStats, + StatsSummary, ) from datalad_registry_client import DEFAULT_BASE_ENDPOINT @@ -40,6 +45,20 @@ def __init__(self, status_code, text): metadata=[], ) +# A dummy `AnnexDsCollectionStats` object +annex_ds_collection_stats = AnnexDsCollectionStats( + ds_count=101, annexed_files_size=1900, annexed_file_count=42 +) +# A dummy `CollectionStats` object +collection_stats = CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=annex_ds_collection_stats, stats=annex_ds_collection_stats + ), + pure_annex_ds_stats=annex_ds_collection_stats, + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=40), + summary=StatsSummary(unique_ds_count=101, ds_count=999), +) + def test_register(): """ @@ -91,6 +110,7 @@ def mock_get(s, url): # noqa: U100 Unused argument url="https://www.example.com" ) ], + collection_stats=collection_stats, ).json(exclude_none=True), ) else: @@ -142,6 +162,7 @@ def mock_get(s, url): # noqa: U100 Unused argument url="https://www.example.com" ) ], + collection_stats=collection_stats, ).json(exclude_none=True), ) else: @@ -185,6 +206,7 @@ def ds_url_pgs(): DatasetURLRespModel(**dataset_url_resp_model_template, url=url) for url in pg ], + collection_stats=collection_stats, ) ds_url_pgs_iter = ds_url_pgs() @@ -250,6 +272,7 @@ def mock_responses(): url="https://www.example.com" ) ], + 
collection_stats=collection_stats, ).json(exclude_none=True), ) From 6fb4df43b961fd1e8ed837ddd7d84b47125f2e71 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 19:53:13 -0700 Subject: [PATCH 23/57] Remove `total` in `DatasetURLPage` `total` is now `DatasetURLPage.collection_stats.summary.ds_count` --- .../blueprints/api/dataset_urls/__init__.py | 1 - .../blueprints/api/dataset_urls/models.py | 3 -- .../test_api/test_dataset_urls.py | 6 ++-- .../tests/test_get_urls.py | 29 ++++++++++--------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index a3efe8a0..cff6f476 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -342,7 +342,6 @@ def cache_path_trans(cache_path: Path) -> str: assert pagination.total is not None page = DatasetURLPage( - total=pagination.total, cur_pg_num=cur_pg_num, prev_pg=( url_for(ep, **base_qry, page=pagination.prev_num) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 65ff0a65..d6f3773b 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -325,9 +325,6 @@ class DatasetURLPage(BaseModel): Model for representing a page of dataset URLs in response communication """ - total: StrictInt = Field( - description="The total number of dataset URLs across all pages" - ) cur_pg_num: StrictInt = Field(description="The number of the current page") prev_pg: Optional[StrictStr] = Field( None, description="The link to the previous page" diff --git a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py index 2a5b9387..486fffdb 100644 --- 
a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py +++ b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py @@ -398,12 +398,12 @@ def test_filter(self, flask_client, query_params, expected_output): ds_url_page = DatasetURLPage.parse_raw(resp.text) - assert ds_url_page.total == expected_out_count assert ds_url_page.cur_pg_num == DEFAULT_PAGE assert ds_url_page.prev_pg is None assert ds_url_page.next_pg is None assert YURL(ds_url_page.first_pg).query["page"] == "1" assert YURL(ds_url_page.last_pg).query["page"] == "1" + assert ds_url_page.collection_stats.summary.ds_count == expected_out_count # Check the collection of dataset URLs assert {i.url for i in ds_url_page.dataset_urls} == expected_output @@ -510,11 +510,11 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): resp_json = resp.json ds_url_pg = DatasetURLPage.parse_obj(resp_json) - assert ds_url_pg.total == 4 assert ds_url_pg.cur_pg_num == 1 assert "prev_pg" not in resp_json assert ds_url_pg.prev_pg is None assert ds_url_pg.next_pg is not None + assert ds_url_pg.collection_stats.summary.ds_count == 4 next_pg_lk, first_pg_lk, last_pg_lk = ( YURL(pg) @@ -548,11 +548,11 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): resp_json = resp.json ds_url_pg = DatasetURLPage.parse_obj(resp_json) - assert ds_url_pg.total == 4 assert ds_url_pg.cur_pg_num == 2 assert ds_url_pg.prev_pg is not None assert "next_pg" not in resp_json assert ds_url_pg.next_pg is None + assert ds_url_pg.collection_stats.summary.ds_count == 4 prev_pg_lk, first_pg_lk, last_pg_lk = ( YURL(pg) diff --git a/datalad_registry_client/tests/test_get_urls.py b/datalad_registry_client/tests/test_get_urls.py index bc3f7bae..73184529 100644 --- a/datalad_registry_client/tests/test_get_urls.py +++ b/datalad_registry_client/tests/test_get_urls.py @@ -45,18 +45,20 @@ def __init__(self, status_code, text): metadata=[], ) -# A dummy `AnnexDsCollectionStats` object +# Dummy stats 
objects annex_ds_collection_stats = AnnexDsCollectionStats( ds_count=101, annexed_files_size=1900, annexed_file_count=42 ) -# A dummy `CollectionStats` object +dl_ds_collection_stats = DataladDsCollectionStats( + unique_ds_stats=annex_ds_collection_stats, stats=annex_ds_collection_stats +) +non_annex_ds_collection_stats = NonAnnexDsCollectionStats(ds_count=40) +stats_summary = StatsSummary(unique_ds_count=101, ds_count=999) collection_stats = CollectionStats( - datalad_ds_stats=DataladDsCollectionStats( - unique_ds_stats=annex_ds_collection_stats, stats=annex_ds_collection_stats - ), + datalad_ds_stats=dl_ds_collection_stats, pure_annex_ds_stats=annex_ds_collection_stats, - non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=40), - summary=StatsSummary(unique_ds_count=101, ds_count=999), + non_annex_ds_stats=non_annex_ds_collection_stats, + summary=stats_summary, ) @@ -98,7 +100,6 @@ def mock_get(s, url): # noqa: U100 Unused argument return MockResponse( 200, DatasetURLPage( - total=200, cur_pg_num=1, prev_pg="dummy", next_pg=None, @@ -150,7 +151,6 @@ def mock_get(s, url): # noqa: U100 Unused argument return MockResponse( 200, DatasetURLPage( - total=100, cur_pg_num=1, prev_pg="dummy", next_pg=None, @@ -191,12 +191,11 @@ def test_handle_successful_response(self, resp_pgs: list[list[str]], monkeypatch """ def ds_url_pgs(): - total = sum(len(pg) for pg in resp_pgs) + ds_count = sum(len(pg) for pg in resp_pgs) for i, pg in enumerate(resp_pgs): # noinspection PyTypeChecker yield DatasetURLPage( - total=total, cur_pg_num=i + 1, prev_pg=None if i == 0 else "foo", next_pg=None if i == len(resp_pgs) - 1 else "foo", @@ -206,7 +205,12 @@ def ds_url_pgs(): DatasetURLRespModel(**dataset_url_resp_model_template, url=url) for url in pg ], - collection_stats=collection_stats, + collection_stats=CollectionStats( + datalad_ds_stats=dl_ds_collection_stats, + pure_annex_ds_stats=annex_ds_collection_stats, + non_annex_ds_stats=non_annex_ds_collection_stats, + 
summary=StatsSummary(unique_ds_count=101, ds_count=ds_count), + ), ) ds_url_pgs_iter = ds_url_pgs() @@ -260,7 +264,6 @@ def mock_responses(): yield MockResponse( 200, DatasetURLPage( - total=200, cur_pg_num=i + 1, prev_pg=None if i == 0 else "foo", next_pg="bar", From e9f027bfed9d8d3bc7e6a858c339450d26a850fa Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 10:38:48 -0700 Subject: [PATCH 24/57] RF: Take out the URL population logic from the fixture The logic can be reuse this way --- datalad_registry/tests/conftest.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/datalad_registry/tests/conftest.py b/datalad_registry/tests/conftest.py index 00fefd81..4388095e 100644 --- a/datalad_registry/tests/conftest.py +++ b/datalad_registry/tests/conftest.py @@ -258,12 +258,28 @@ def populate_with_2_dataset_urls(flask_app): db.session.commit() +def _populate_with_dataset_urls(urls: list[RepoUrl], flask_app): + """ + Populate the `repo_url` table with a list of RepoUrl objects + + :param urls: The list of RepoUrl objects to populate + :return: The list of URLs, expressed in `str`, that were added to the database + """ + + with flask_app.app_context(): + for url in urls: + db.session.add(url) + db.session.commit() + + return [url.url for url in urls] + + @pytest.fixture def populate_with_dataset_urls(flask_app) -> list[str]: """ - Populate the url table with a list of DatasetURLs. 
+ Populate the `repo_url` table with a list of RepoUrl objects - Returns: The list of DatasetURLs that were added to the database + Returns: The list of URLs, expressed in `str`, that were added to the database """ urls = [ @@ -319,12 +335,7 @@ def populate_with_dataset_urls(flask_app) -> list[str]: ), ] - with flask_app.app_context(): - for url in urls: - db.session.add(url) - db.session.commit() - - return [url.url for url in urls] + return _populate_with_dataset_urls(urls, flask_app) @pytest.fixture From f1f8887a4b579bbefdc38282611abea727f85e18 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 10:45:58 -0700 Subject: [PATCH 25/57] Rename pytest fixture --- datalad_registry/tests/conftest.py | 6 +++--- .../test_blueprints/test_api/test_dataset_urls.py | 12 ++++++------ datalad_registry/tests/test_overview.py | 10 +++++----- datalad_registry/tests/test_search.py | 2 +- .../tests/test_tasks/test_chk_url_to_update.py | 6 +++--- .../tests/test_tasks/test_mark_for_chk.py | 4 ++-- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/datalad_registry/tests/conftest.py b/datalad_registry/tests/conftest.py index 4388095e..20f6a7c1 100644 --- a/datalad_registry/tests/conftest.py +++ b/datalad_registry/tests/conftest.py @@ -275,9 +275,9 @@ def _populate_with_dataset_urls(urls: list[RepoUrl], flask_app): @pytest.fixture -def populate_with_dataset_urls(flask_app) -> list[str]: +def populate_with_std_ds_urls(flask_app) -> list[str]: """ - Populate the `repo_url` table with a list of RepoUrl objects + Populate the `repo_url` table with a list of standard (typical) RepoUrl objects Returns: The list of URLs, expressed in `str`, that were added to the database """ @@ -340,7 +340,7 @@ def populate_with_dataset_urls(flask_app) -> list[str]: @pytest.fixture def populate_with_url_metadata( - populate_with_dataset_urls, # noqa: U100 (unused argument) + populate_with_std_ds_urls, # noqa: U100 (unused argument) flask_app, ): """ diff --git 
a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py index 486fffdb..2f13e920 100644 --- a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py +++ b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py @@ -119,7 +119,7 @@ def mock_commit(_scoped_session_obj): "/api/v2/dataset-urls", json={"url": "https://www.example.com"} ) - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "url, expected_mark_for_chk_delay_args", [ @@ -255,7 +255,7 @@ def test_valid_query_params(self, flask_client, query_params): resp = flask_client.get("/api/v2/dataset-urls", query_string=query_params) assert resp.status_code == 200 - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "query_params, expected_output", [ @@ -408,7 +408,7 @@ def test_filter(self, flask_client, query_params, expected_output): # Check the collection of dataset URLs assert {i.url for i in ds_url_page.dataset_urls} == expected_output - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "query_params", [ @@ -494,7 +494,7 @@ def test_metadata_return(self, metadata_ret_opt, flask_client): assert all(type(m) is metadata_ret_type for m in url.metadata) - def test_pagination(self, populate_with_dataset_urls, flask_client): + def test_pagination(self, populate_with_std_ds_urls, flask_client): """ Test the pagination of the results """ @@ -578,9 +578,9 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): for url in ds_url_pg.dataset_urls: ds_urls.add(str(url.url)) - assert ds_urls == set(populate_with_dataset_urls) + assert ds_urls == set(populate_with_std_ds_urls) - @pytest.mark.usefixtures("populate_with_dataset_urls") + 
@pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "query_params, expected_results_by_id_prefix", [ diff --git a/datalad_registry/tests/test_overview.py b/datalad_registry/tests/test_overview.py index 3472ec65..2763f051 100644 --- a/datalad_registry/tests/test_overview.py +++ b/datalad_registry/tests/test_overview.py @@ -6,7 +6,7 @@ class TestOverView: - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "sort_by, expected_order", [ @@ -143,7 +143,7 @@ def test_sorting( assert url_list == expected_order - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "search_query, expected_results", [ @@ -180,7 +180,7 @@ def test_search_with_valid_query( assert url_list == expected_results - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "search_query, err_msg_prefix", [ @@ -206,7 +206,7 @@ def test_search_with_invalid_query( assert (error_span := soup.find("span", class_="error")) assert error_span.text.startswith(f"ERROR: {err_msg_prefix}") - def test_pagination(self, populate_with_dataset_urls, flask_client): + def test_pagination(self, populate_with_std_ds_urls, flask_client): """ Test pagination in Web UI """ @@ -272,7 +272,7 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): assert page_1_link.query["per_page"] == "2" assert page_1_link.query["sort"] == "update-desc" - assert ds_urls == set(populate_with_dataset_urls) + assert ds_urls == set(populate_with_std_ds_urls) @pytest.mark.usefixtures("populate_with_url_metadata") def test_metadata(self, flask_client): diff --git a/datalad_registry/tests/test_search.py b/datalad_registry/tests/test_search.py index 137e276f..39f85ec1 100644 --- a/datalad_registry/tests/test_search.py +++ 
b/datalad_registry/tests/test_search.py @@ -12,7 +12,7 @@ @pytest.fixture def populate_with_url_metadata_for_search( - populate_with_dataset_urls, # noqa: U100 (unused argument) + populate_with_std_ds_urls, # noqa: U100 (unused argument) flask_app, ): """ diff --git a/datalad_registry/tests/test_tasks/test_chk_url_to_update.py b/datalad_registry/tests/test_tasks/test_chk_url_to_update.py index 525cc0ff..ea351912 100644 --- a/datalad_registry/tests/test_tasks/test_chk_url_to_update.py +++ b/datalad_registry/tests/test_tasks/test_chk_url_to_update.py @@ -13,7 +13,7 @@ # and the db and the cache are clean @pytest.mark.usefixtures("flask_app") class TestChkUrlToUpdate: - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize("invalid_url_id", [-1, 0, 5, 10]) def test_repo_url_not_found(self, invalid_url_id): """ @@ -21,7 +21,7 @@ def test_repo_url_not_found(self, invalid_url_id): """ assert chk_url_to_update(invalid_url_id, None) is ChkUrlStatus.ABORTED - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "url_id, initial_last_chk_dt", [ @@ -38,7 +38,7 @@ def test_chk_handled_by_another_process(self, url_id, initial_last_chk_dt): """ assert chk_url_to_update(url_id, initial_last_chk_dt) is ChkUrlStatus.SKIPPED - @pytest.mark.usefixtures("populate_with_dataset_urls", "fix_datetime_now") + @pytest.mark.usefixtures("populate_with_std_ds_urls", "fix_datetime_now") @pytest.mark.parametrize( "url_id, initial_last_chk_dt, original_n_failed_chks, original_chk_req_dt", [ diff --git a/datalad_registry/tests/test_tasks/test_mark_for_chk.py b/datalad_registry/tests/test_tasks/test_mark_for_chk.py index 079c5c63..696c029d 100644 --- a/datalad_registry/tests/test_tasks/test_mark_for_chk.py +++ b/datalad_registry/tests/test_tasks/test_mark_for_chk.py @@ -11,7 +11,7 @@ # and the db and the cache are clean 
@pytest.mark.usefixtures("flask_app") class TestMarkForChk: - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize("url_id", [5, 42]) def test_non_existing_url(self, url_id, mocker: MockerFixture): """ @@ -32,7 +32,7 @@ def now(cls, *args, **kwargs): datetime_mock.now.assert_not_called() - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "url_id, original_chk_req_dt, expecting_chk_req_dt_changed", [ From f1841822a199ac655665b8d4a9def625acceae06 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 10:57:42 -0700 Subject: [PATCH 26/57] RF: Move the URL population logic to `tools.py` This allows reuse of the logic by tests in different files --- datalad_registry/tests/conftest.py | 20 +++----------------- datalad_registry/tests/tools.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 datalad_registry/tests/tools.py diff --git a/datalad_registry/tests/conftest.py b/datalad_registry/tests/conftest.py index 20f6a7c1..410ba586 100644 --- a/datalad_registry/tests/conftest.py +++ b/datalad_registry/tests/conftest.py @@ -16,6 +16,8 @@ from datalad_registry.models import RepoUrl, URLMetadata, db from datalad_registry.utils.datalad_tls import clone +from .tools import populate_with_dataset_urls + @pytest.fixture(scope="session") def set_test_env(tmp_path_factory): @@ -258,22 +260,6 @@ def populate_with_2_dataset_urls(flask_app): db.session.commit() -def _populate_with_dataset_urls(urls: list[RepoUrl], flask_app): - """ - Populate the `repo_url` table with a list of RepoUrl objects - - :param urls: The list of RepoUrl objects to populate - :return: The list of URLs, expressed in `str`, that were added to the database - """ - - with flask_app.app_context(): - for url in urls: - db.session.add(url) - db.session.commit() - - return [url.url for url in 
urls] - - @pytest.fixture def populate_with_std_ds_urls(flask_app) -> list[str]: """ @@ -335,7 +321,7 @@ def populate_with_std_ds_urls(flask_app) -> list[str]: ), ] - return _populate_with_dataset_urls(urls, flask_app) + return populate_with_dataset_urls(urls, flask_app) @pytest.fixture diff --git a/datalad_registry/tests/tools.py b/datalad_registry/tests/tools.py new file mode 100644 index 00000000..3c64dae4 --- /dev/null +++ b/datalad_registry/tests/tools.py @@ -0,0 +1,21 @@ +# This file contains helper functions for testing purposes + +from datalad_registry.models import RepoUrl, db + + +def populate_with_dataset_urls(urls: list[RepoUrl], flask_app): + """ + Populate the `repo_url` table with a list of RepoUrl objects + + :param urls: The list of RepoUrl objects to populate + :param flask_app: The Flask app instance which provides the context for + database access + :return: The list of URLs, expressed in `str`, that were added to the database + """ + + with flask_app.app_context(): + for url in urls: + db.session.add(url) + db.session.commit() + + return [url.url for url in urls] From bda5f9c4f072ee3bf9f7f9d3f1d67de5b9b5a8e3 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 11:03:06 -0700 Subject: [PATCH 27/57] RF: Simplify the expression to add RepoUrls to DB --- datalad_registry/tests/tools.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datalad_registry/tests/tools.py b/datalad_registry/tests/tools.py index 3c64dae4..34c7d12d 100644 --- a/datalad_registry/tests/tools.py +++ b/datalad_registry/tests/tools.py @@ -14,8 +14,7 @@ def populate_with_dataset_urls(urls: list[RepoUrl], flask_app): """ with flask_app.app_context(): - for url in urls: - db.session.add(url) + db.session.add_all(urls) db.session.commit() return [url.url for url in urls] From ee603688d7e3971ced73b2fd45c5b8708b4509f3 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 12:23:04 -0700 Subject: [PATCH 28/57] Fully type annotate 
`populate_with_dataset_urls` --- datalad_registry/tests/tools.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_registry/tests/tools.py b/datalad_registry/tests/tools.py index 34c7d12d..f07a9ca7 100644 --- a/datalad_registry/tests/tools.py +++ b/datalad_registry/tests/tools.py @@ -1,9 +1,11 @@ # This file contains helper functions for testing purposes +from flask import Flask + from datalad_registry.models import RepoUrl, db -def populate_with_dataset_urls(urls: list[RepoUrl], flask_app): +def populate_with_dataset_urls(urls: list[RepoUrl], flask_app: Flask) -> list[str]: """ Populate the `repo_url` table with a list of RepoUrl objects From e8c0d1c8e02680032dc7c2516630e4f824fb6c07 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 17:09:50 -0700 Subject: [PATCH 29/57] Provide tests for return dataset collect stats --- .../test_api/test_dataset_urls.py | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) diff --git a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py index 2f13e920..9e22c6f1 100644 --- a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py +++ b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py @@ -7,14 +7,21 @@ from datalad_registry.blueprints.api.dataset_urls import DatasetURLRespModel from datalad_registry.blueprints.api.dataset_urls.models import ( DEFAULT_PAGE, + AnnexDsCollectionStats, + CollectionStats, + DataladDsCollectionStats, DatasetURLPage, MetadataReturnOption, + NonAnnexDsCollectionStats, + StatsSummary, ) from datalad_registry.blueprints.api.url_metadata.models import ( URLMetadataModel, URLMetadataRef, ) from datalad_registry.conf import OperationMode +from datalad_registry.models import RepoUrl +from datalad_registry.tests.tools import populate_with_dataset_urls class TestDeclareDatasetURL: @@ -675,6 +682,242 @@ def test_ordering(self, query_params, 
expected_results_by_id_prefix, flask_clien == expected_results_by_id_prefix ) + @pytest.mark.parametrize( + "query_params, expected_stats", + [ + ( + {}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=3, + annexed_files_size=400 + 1001, + annexed_file_count=50 + 100 + 150, + ), + stats=AnnexDsCollectionStats( + ds_count=6, + annexed_files_size=1000 + 1001 + 400, + annexed_file_count=120 + 50 + 100 + 120 + 150 + 130, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=1, annexed_files_size=600, annexed_file_count=100 + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=1), + summary=StatsSummary(unique_ds_count=3, ds_count=9), + ), + ), + ( + {"search": "url:datalad"}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=0), + summary=StatsSummary(unique_ds_count=2, ds_count=2), + ), + ), + ( + {"search": "url:.org"}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=1), + summary=StatsSummary(unique_ds_count=2, ds_count=4), + ), + ), + ( + # === The case of an empty set of dataset URLs returned === + {"search": "url:.tv"}, + 
CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=0, + annexed_files_size=None, + annexed_file_count=None, + ), + stats=AnnexDsCollectionStats( + ds_count=0, + annexed_files_size=None, + annexed_file_count=None, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=0), + summary=StatsSummary(unique_ds_count=0, ds_count=0), + ), + ), + ( + {"search": "url:distribits.live"}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1001, + annexed_file_count=100 + 150, + ), + stats=AnnexDsCollectionStats( + ds_count=4, + annexed_files_size=1001, + annexed_file_count=100 + 120 + 150 + 130, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=0), + summary=StatsSummary(unique_ds_count=2, ds_count=4), + ), + ), + ], + ) + def test_stats(self, query_params, expected_stats, flask_app, flask_client): + """ + Test the compilation of stats regarding the returned dataset URLs + """ + + # Populate the DB with dataset URLs suitable for testing the stats + urls = [ + RepoUrl( + url="https://www.example.com", + ds_id=None, + annexed_files_in_wt_count=100, + annexed_files_in_wt_size=600, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fd6db", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="http://www.datalad.org", + ds_id="2a0b7b7b-a984-4c4a-844c-be3132291a7c", + annexed_files_in_wt_count=120, + annexed_files_in_wt_size=1000, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fd6ta", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://handbook.datalad.org", + 
ds_id="2b73b99e-59cc-4f35-833a-69c75ca5b0c5", + annexed_files_in_wt_count=50, + annexed_files_in_wt_size=400, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fd6cc", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://www.dandiarchive.org", + ds_id=None, + annexed_files_in_wt_count=100, + annexed_files_in_wt_size=300, + ), + RepoUrl( + url="https://distribits.live", + ds_id="2a0b7b7b-a984-4c4a-844c-be3132291a7c", + annexed_files_in_wt_count=100, + annexed_files_in_wt_size=1001, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://distribits.live/1", + ds_id="48185fb3-aa80-47b4-8ab1-1d7d9fc8b192", + annexed_files_in_wt_count=120, + annexed_files_in_wt_size=None, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://distribits.live/2", + ds_id="48185fb3-aa80-47b4-8ab1-1d7d9fc8b192", + annexed_files_in_wt_count=150, + annexed_files_in_wt_size=None, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://distribits.live/3", + ds_id="48185fb3-aa80-47b4-8ab1-1d7d9fc8b192", + annexed_files_in_wt_count=130, + annexed_files_in_wt_size=None, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://centerforopenneuroscience.org", + ds_id=None, + annexed_files_in_wt_count=None, + annexed_files_in_wt_size=None, + branches={ + "main": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + }, + "dev": { + "hexsha": "f21cff198ce84438bd60d459577401d7175fdaba", + "last_commit_dt": 
"2022-11-18T19:18:23+00:00", + }, + }, + ), + ] + populate_with_dataset_urls(urls, flask_app) + + resp = flask_client.get("/api/v2/dataset-urls", query_string=query_params) + + assert DatasetURLPage.parse_raw(resp.text).collection_stats == expected_stats + @pytest.mark.usefixtures("populate_with_2_dataset_urls") class TestDatasetURL: From 80df6ac67cf20c41ea6b7ac15803a8fe2f54edef Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:21:39 -0700 Subject: [PATCH 30/57] Remove grouping by `RepoUrl` This statement is not needed because `RepoUrl` objects are already unique --- datalad_registry/overview.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index f8c5915f..a75ebf2d 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -49,7 +49,6 @@ def overview(): # No type hints due to mypy#7187. select_stmt = select_stmt.filter(criteria) # Sort - select_stmt = select_stmt.group_by(RepoUrl) sort_by = request.args.get("sort", default_sort_scheme, type=str) if sort_by not in _SORT_ATTRS: lgr.debug("Ignoring unknown sort parameter: %s", sort_by) From b124dc339a7eec02a95feec88cd74458d7c0aba3 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:25:27 -0700 Subject: [PATCH 31/57] Reorganize code with improved comments --- datalad_registry/overview.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index a75ebf2d..27f03eeb 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -48,12 +48,14 @@ def overview(): # No type hints due to mypy#7187. 
else: select_stmt = select_stmt.filter(criteria) - # Sort + # Decipher sorting scheme sort_by = request.args.get("sort", default_sort_scheme, type=str) if sort_by not in _SORT_ATTRS: lgr.debug("Ignoring unknown sort parameter: %s", sort_by) sort_by = default_sort_scheme col, sort_method = _SORT_ATTRS[sort_by] + + # Apply sorting select_stmt = select_stmt.order_by( nullslast(getattr(getattr(RepoUrl, col), sort_method)()) ) From f0db49c38e342e1c9aa85fc678ac2c76520a2cb8 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:32:13 -0700 Subject: [PATCH 32/57] Capture the "base select statement" This statement exclude the ordering of the elements. Gathering stats using this statement results in simpler query. --- datalad_registry/overview.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index 27f03eeb..4b99ac9a 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -32,7 +32,7 @@ def overview(): # No type hints due to mypy#7187. default_sort_scheme = "update-desc" - select_stmt = select(RepoUrl) + base_select_stmt = select(RepoUrl) # Search using query if provided. # ATM it is just a 'filter' on URL records, later might be more complex @@ -46,7 +46,7 @@ def overview(): # No type hints due to mypy#7187. except Exception as e: search_error = str(e) else: - select_stmt = select_stmt.filter(criteria) + base_select_stmt = base_select_stmt.filter(criteria) # Decipher sorting scheme sort_by = request.args.get("sort", default_sort_scheme, type=str) @@ -56,7 +56,7 @@ def overview(): # No type hints due to mypy#7187. 
col, sort_method = _SORT_ATTRS[sort_by] # Apply sorting - select_stmt = select_stmt.order_by( + select_stmt = base_select_stmt.order_by( nullslast(getattr(getattr(RepoUrl, col), sort_method)()) ) From 177db1651de86ba3f2ed04e13309aec8dc99388d Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:40:12 -0700 Subject: [PATCH 33/57] Gather returned dataset collection stats and pass it to web UI --- datalad_registry/overview.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index 4b99ac9a..4e151abd 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -6,6 +6,7 @@ from flask import Blueprint, render_template, request from sqlalchemy import nullslast, select +from datalad_registry.blueprints.api.dataset_urls.tools import get_collection_stats from datalad_registry.models import RepoUrl, db from datalad_registry.search import parse_query @@ -63,9 +64,13 @@ def overview(): # No type hints due to mypy#7187. 
# Paginate pagination = db.paginate(select_stmt) + # Gather stats of the returned collection of datasets + stats = get_collection_stats(base_select_stmt) + return render_template( "overview.html", pagination=pagination, + stats=stats, sort_by=sort_by, search_query=query, search_error=search_error, From 7581a3eb097fcedfb9888d50b5bd9a14f2ff1a62 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 11:49:30 -0700 Subject: [PATCH 34/57] Fix importing of `OperationMode` The current import directly from `datalad_registry` causes circulating import error --- datalad_registry/blueprints/api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/utils.py b/datalad_registry/blueprints/api/utils.py index f8265727..41422e1d 100644 --- a/datalad_registry/blueprints/api/utils.py +++ b/datalad_registry/blueprints/api/utils.py @@ -3,7 +3,7 @@ from flask import current_app, request -from datalad_registry import OperationMode +from datalad_registry.conf import OperationMode from datalad_registry.utils.flask_tools import json_resp_from_str from . import HTTPExceptionResp From 8d160a3f7877b85bef16b26af5756342b57ab4b2 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 13:22:21 -0700 Subject: [PATCH 35/57] Provide stats at the end of web UI --- datalad_registry/templates/overview.html | 68 ++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index d2287ce8..dea5961e 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -178,6 +178,74 @@

Search query syntax

{{ render_pagination_widget(pagination, '.overview') }}
+ + +
+

Stats

+
    +
  • +

    Datalad Datasets Stats

    +
      +
    • +

      Unique Datalad Dataset Stats

      +
        +
      • Count: {{ stats.datalad_ds_stats.unique_ds_stats.ds_count }}
      • + {% if stats.datalad_ds_stats.unique_ds_stats.annexed_file_count is not none %} +
      • Annexed file + count: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_file_count }}
      • + {% endif %} + {% if stats.datalad_ds_stats.unique_ds_stats.annexed_files_size is not none %} +
      • Annexed files + size: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_files_size }}
      • + {% endif %} +
      +
    • +
    • +

      Stats without deduplication

      +
        +
      • Count: {{ stats.datalad_ds_stats.stats.ds_count }}
      • + {% if stats.datalad_ds_stats.stats.annexed_file_count is not none %} +
      • Annexed file + count: {{ stats.datalad_ds_stats.stats.annexed_file_count }}
      • + {% endif %} + {% if stats.datalad_ds_stats.stats.annexed_files_size is not none %} +
      • Annexed files + size: {{ stats.datalad_ds_stats.stats.annexed_files_size }}
      • + {% endif %} +
      +
    • +
    +
  • +
  • +

    Pure Annex Dataset Stats

    +
      +
    • Count: {{ stats.pure_annex_ds_stats.ds_count }}
    • + {% if stats.pure_annex_ds_stats.annexed_file_count is not none %} +
    • Annexed file + count: {{ stats.pure_annex_ds_stats.annexed_file_count }}
    • + {% endif %} + {% if stats.pure_annex_ds_stats.annexed_files_size is not none %} +
    • Annexed files + size: {{ stats.pure_annex_ds_stats.annexed_files_size }}
    • + {% endif %} +
    +
  • +
  • +

    Non-Annex Dataset Stats

    +
      +
    • Count: {{ stats.non_annex_ds_stats.ds_count }}
    • +
    +
  • +
  • +

    Summary

    +
      +
    • Unique dataset count: {{ stats.summary.unique_ds_count }}
    • +
    • Total dataset count (without + deduplication): {{ stats.summary.ds_count }}
    • +
    +
  • +
+
From 69490edd64020c7c5be948e442906875e9550b3f Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 13:41:53 -0700 Subject: [PATCH 36/57] RF: Use a macro to render stats for annex dataset collection --- datalad_registry/templates/overview.html | 50 ++++++++---------------- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index dea5961e..685ffb10 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -27,6 +27,20 @@ {% endmacro %} +{% macro render_annex_ds_collection_stats(annex_ds_col_stats) %} +
    +
  • Count: {{ annex_ds_col_stats.ds_count }}
  • + {% if annex_ds_col_stats.annexed_file_count is not none %} +
  • Annexed file + count: {{ annex_ds_col_stats.annexed_file_count }}
  • + {% endif %} + {% if annex_ds_col_stats.annexed_files_size is not none %} +
  • Annexed files + size: {{ annex_ds_col_stats.annexed_files_size }}
  • + {% endif %} +
+{% endmacro %} + @@ -188,47 +202,17 @@

Datalad Datasets Stats

  • Unique Datalad Dataset Stats

    -
      -
    • Count: {{ stats.datalad_ds_stats.unique_ds_stats.ds_count }}
    • - {% if stats.datalad_ds_stats.unique_ds_stats.annexed_file_count is not none %} -
    • Annexed file - count: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_file_count }}
    • - {% endif %} - {% if stats.datalad_ds_stats.unique_ds_stats.annexed_files_size is not none %} -
    • Annexed files - size: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_files_size }}
    • - {% endif %} -
    + {{ render_annex_ds_collection_stats(stats.datalad_ds_stats.unique_ds_stats) }}
  • Stats without deduplication

    -
      -
    • Count: {{ stats.datalad_ds_stats.stats.ds_count }}
    • - {% if stats.datalad_ds_stats.stats.annexed_file_count is not none %} -
    • Annexed file - count: {{ stats.datalad_ds_stats.stats.annexed_file_count }}
    • - {% endif %} - {% if stats.datalad_ds_stats.stats.annexed_files_size is not none %} -
    • Annexed files - size: {{ stats.datalad_ds_stats.stats.annexed_files_size }}
    • - {% endif %} -
    + {{ render_annex_ds_collection_stats(stats.datalad_ds_stats.stats) }}
  • Pure Annex Dataset Stats

    -
      -
    • Count: {{ stats.pure_annex_ds_stats.ds_count }}
    • - {% if stats.pure_annex_ds_stats.annexed_file_count is not none %} -
    • Annexed file - count: {{ stats.pure_annex_ds_stats.annexed_file_count }}
    • - {% endif %} - {% if stats.pure_annex_ds_stats.annexed_files_size is not none %} -
    • Annexed files - size: {{ stats.pure_annex_ds_stats.annexed_files_size }}
    • - {% endif %} -
    + {{ render_annex_ds_collection_stats(stats.pure_annex_ds_stats) }}
  • Non-Annex Dataset Stats

    From 4ea58920593dcc96b9d3391bbd94cb654071bd09 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 14:13:47 -0700 Subject: [PATCH 37/57] Add stats trigger button --- datalad_registry/templates/overview.html | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index 685ffb10..8ac368b6 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -193,6 +193,9 @@

    Search query syntax

    {{ render_pagination_widget(pagination, '.overview') }} + + +

    Stats

    From eb97bafe18a99e0c208cd42efd8d18625dd2e8d4 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 14:34:18 -0700 Subject: [PATCH 38/57] Enclose stats in a `modal` element --- datalad_registry/templates/overview.html | 66 +++++++++++++----------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index 8ac368b6..a01c988c 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -197,41 +197,49 @@

    Search query syntax

    -
    -

    Stats

    -
      -
    • -

      Datalad Datasets Stats

      +
    • -
    • -

      Pure Annex Dataset Stats

      - {{ render_annex_ds_collection_stats(stats.pure_annex_ds_stats) }} -
    • -
    • -

      Non-Annex Dataset Stats

      -
        -
      • Count: {{ stats.non_annex_ds_stats.ds_count }}
      • -
      -
    • -
    • -

      Summary

      -
        -
      • Unique dataset count: {{ stats.summary.unique_ds_count }}
      • -
      • Total dataset count (without - deduplication): {{ stats.summary.ds_count }}
      • -
      -
    • -
    +
    +
    From 1711619a6fc3b2a0b8a1fa522d4186fc3f8da17c Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 17:00:00 -0700 Subject: [PATCH 39/57] Rename html elements and improve comment So new similar elements can be distinguished --- datalad_registry/templates/overview.html | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index a01c988c..5b3ec8f7 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -245,29 +245,29 @@

    Summary