diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 85005609..cff6f476 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -9,7 +9,7 @@ from flask_openapi3 import APIBlueprint, Tag from lark.exceptions import GrammarError, UnexpectedInput from psycopg2.errors import UniqueViolation -from sqlalchemy import ColumnElement, and_ +from sqlalchemy import ColumnElement, and_, select from sqlalchemy.exc import IntegrityError from datalad_registry.models import RepoUrl, db @@ -32,6 +32,7 @@ PathParams, QueryParams, ) +from .tools import get_collection_stats from .. import ( API_URL_PREFIX, COMMON_API_RESPONSES, @@ -92,7 +93,7 @@ def declare_dataset_url(body: DatasetURLSubmitModel): url_as_str = str(body.url) repo_url_row = db.session.execute( - db.select(RepoUrl).filter_by(url=url_as_str) + select(RepoUrl).filter_by(url=url_as_str) ).one_or_none() if repo_url_row is None: # == The URL requested to be created does not exist in the database == @@ -116,7 +117,7 @@ def declare_dataset_url(body: DatasetURLSubmitModel): # of the URL in the database db.session.rollback() repo_url_added_by_another = ( - db.session.execute(db.select(RepoUrl).filter_by(url=url_as_str)) + db.session.execute(select(RepoUrl).filter_by(url=url_as_str)) .scalars() .one_or_none() ) @@ -285,11 +286,11 @@ def cache_path_trans(cache_path: Path) -> str: ep = ".dataset_urls" # Endpoint of `dataset_urls` base_qry = loads(query.json(exclude={"page"}, exclude_none=True)) + base_select_stmt = select(RepoUrl).filter(and_(True, *constraints)) + max_per_page = 100 # The overriding limit to `per_page` provided by the requester pagination = db.paginate( - db.select(RepoUrl) - .filter(and_(True, *constraints)) - .order_by( + base_select_stmt.order_by( getattr( _ORDER_KEY_TO_SQLA_ATTR[query.order_by], query.order_dir.value )().nulls_last() @@ -341,7 +342,6 
@@ def cache_path_trans(cache_path: Path) -> str: assert pagination.total is not None page = DatasetURLPage( - total=pagination.total, cur_pg_num=cur_pg_num, prev_pg=( url_for(ep, **base_qry, page=pagination.prev_num) @@ -356,6 +356,7 @@ def cache_path_trans(cache_path: Path) -> str: first_pg=url_for(ep, **base_qry, page=1), last_pg=url_for(ep, **base_qry, page=1 if total_pages == 0 else total_pages), dataset_urls=ds_urls, + collection_stats=get_collection_stats(base_select_stmt), ) return json_resp_from_str(page.json(exclude_none=True)) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index ca000424..d6f3773b 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -9,6 +9,7 @@ BaseModel, Field, FileUrl, + NonNegativeInt, PositiveInt, StrictInt, StrictStr, @@ -256,14 +257,74 @@ class Config: by_alias = False +class AnnexDsCollectionStats(BaseModel): + """ + Model with the base components of annex dataset collection statistics + """ + + ds_count: NonNegativeInt = Field(description="The number of datasets") + annexed_files_size: Optional[NonNegativeInt] = Field( + None, description="The size of annexed files" + ) + annexed_file_count: Optional[NonNegativeInt] = Field( + None, description="The number of annexed files" + ) + + +class DataladDsCollectionStats(BaseModel): + """ + Model for DataLad dataset collection statistics + """ + + unique_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for unique datasets" + ) + stats: AnnexDsCollectionStats = Field( + description="Statistics for all datasets, as individual repos, " + "without any deduplication" + ) + + +class NonAnnexDsCollectionStats(BaseModel): + """ + Model for non-annex dataset collection statistics + """ + + ds_count: NonNegativeInt = Field( + description="The number of datasets, as individual repos, " + "without any deduplication" + ) + + 
+class StatsSummary(BaseModel): + unique_ds_count: NonNegativeInt = Field(description="The number of unique datasets") + ds_count: NonNegativeInt = Field( + description="The number of datasets, as individual repos, " + "without any deduplication" + ) + + +class CollectionStats(BaseModel): + datalad_ds_stats: DataladDsCollectionStats = Field( + description="Statistics for DataLad datasets" + ) + pure_annex_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for pure annex datasets, as individual repos, " + "without any deduplication" + ) + non_annex_ds_stats: NonAnnexDsCollectionStats = Field( + description="Statistics for non-annex datasets, as individual repos, " + "without any deduplication" + ) + + summary: StatsSummary = Field(description="Summary statistics") + + class DatasetURLPage(BaseModel): """ Model for representing a page of dataset URLs in response communication """ - total: StrictInt = Field( - description="The total number of dataset URLs across all pages" - ) cur_pg_num: StrictInt = Field(description="The number of the current page") prev_pg: Optional[StrictStr] = Field( None, description="The link to the previous page" @@ -275,3 +336,8 @@ class DatasetURLPage(BaseModel): dataset_urls: list[DatasetURLRespModel] = Field( description="The list of dataset URLs in the current page" ) + collection_stats: CollectionStats = Field( + description="Statistics about the collection of dataset URLs, " + "not just the URLs in the current page but the entire collection " + "returned" + ) diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py new file mode 100644 index 00000000..6d518cb1 --- /dev/null +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -0,0 +1,238 @@ +from sqlalchemy import ( + Select, + Subquery, + TableClause, + and_, + column, + func, + not_, + or_, + select, + table, + text, +) + +from datalad_registry.models import RepoUrl, db + +from .models import 
from .models import (
    AnnexDsCollectionStats,
    CollectionStats,
    DataladDsCollectionStats,
    NonAnnexDsCollectionStats,
    StatsSummary,
)


def cache_result_to_tmp_tb(select_stmt: Select, tb_name: str) -> TableClause:
    """
    Execute the given select statement and cache the result to a temporary table
    with the given name

    :param select_stmt: The given select statement to execute
    :param tb_name: The string to use as the name of the temporary table.
        It is interpolated into the SQL text verbatim, so it must be a trusted
        identifier, never user input
    :return: An object representing the temporary table, exposing the same
        column names and types as the columns selected by `select_stmt`

    Note: The execution of this function requires the Flask app's context
    """
    # The bound values are rendered inline since the select statement is embedded
    # in a `CREATE TABLE ... AS` statement that is sent as one plain string.
    compiled_select = select_stmt.compile(
        bind=db.engine, compile_kwargs={"literal_binds": True}
    )
    create_tmp_tb_sql = f"""
        CREATE TEMPORARY TABLE {tb_name} AS
        {compiled_select};
    """

    # The compiled string is already in the form expected by the DBAPI driver.
    # Wrapping it in `text()` would parse it again, turning any `:name` sequence
    # inside an inlined literal (e.g. one coming from a search term) into a bind
    # parameter without a value. Hand the string to the driver as is, on the
    # connection (and thus within the transaction) of the current session.
    db.session.connection().exec_driver_sql(create_tmp_tb_sql)

    return table(
        tb_name,
        *(column(name, c.type) for name, c in select_stmt.selected_columns.items()),
    )


def _get_annex_ds_collection_stats(q: Subquery) -> AnnexDsCollectionStats:
    """
    Get the stats of a collection of datasets that consists only of annex datasets

    :param q: The query that specifies the collection of datasets under
        consideration. It must expose the columns `annexed_files_in_wt_size`
        and `annexed_files_in_wt_count`
    :return: The object representing the stats

    Note: The execution of this function requires the Flask app's context
    """

    # `SUM` over no rows, or over only NULL values, yields NULL, which maps to
    # `None` for the corresponding optional fields of the model
    ds_count, annexed_files_size, annexed_file_count = db.session.execute(
        select(
            func.count().label("ds_count"),
            func.sum(q.c.annexed_files_in_wt_size).label("annexed_files_size"),
            func.sum(q.c.annexed_files_in_wt_count).label("annexed_file_count"),
        ).select_from(q)
    ).one()

    return AnnexDsCollectionStats(
        ds_count=ds_count,
        annexed_files_size=annexed_files_size,
        annexed_file_count=annexed_file_count,
    )


def get_unique_dl_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionStats:
    """
    Get the stats of the subset of the collection of datasets that consists only
    of Datalad datasets, considering datasets with the same `ds_id` as the same
    dataset

    :param base_q: The base query that specifies the collection of datasets
                   under consideration
    :return: The object representing the stats

    Note: The execution of this function requires the Flask app's context
    """

    # For each `ds_id` in the collection, the largest size of annexed files
    # among the repos sharing that `ds_id`
    grp_by_id_q = (
        select(
            base_q.c.ds_id,
            func.max(base_q.c.annexed_files_in_wt_size).label(
                "max_annexed_files_in_wt_size"
            ),
        )
        .group_by(base_q.c.ds_id)
        .subquery("grp_by_id_q")
    )

    # One row per `ds_id`: among the repos in the collection that have the largest
    # size (or among all of them if no size is known), take the largest count of
    # annexed files.
    #
    # The rows are read from `base_q`, not from the whole `RepoUrl` table, so that
    # repos outside the collection that merely share a `ds_id` with a repo in it
    # cannot contribute to, or multiply, the stats.
    #
    # Rows with a NULL `ds_id` never satisfy the equality in the join condition,
    # so repos that are not Datalad datasets are left out.
    grp_by_id_and_a_f_size_q = (
        select(
            base_q.c.ds_id,
            base_q.c.annexed_files_in_wt_size,
            func.max(base_q.c.annexed_files_in_wt_count).label(
                "annexed_files_in_wt_count"
            ),
        )
        .join(
            grp_by_id_q,
            and_(
                base_q.c.ds_id == grp_by_id_q.c.ds_id,
                or_(
                    grp_by_id_q.c.max_annexed_files_in_wt_size.is_(None),
                    base_q.c.annexed_files_in_wt_size
                    == grp_by_id_q.c.max_annexed_files_in_wt_size,
                ),
            ),
        )
        .group_by(base_q.c.ds_id, base_q.c.annexed_files_in_wt_size)
        .subquery("grp_by_id_and_a_f_size_q")
    )

    return _get_annex_ds_collection_stats(grp_by_id_and_a_f_size_q)


def get_dl_ds_collection_stats_with_dups(base_q: Subquery) -> AnnexDsCollectionStats:
    """
    Get the stats of the subset of the collection of datasets that consists only
    of Datalad datasets, considering individual repos as a dataset regardless of
    the value of `ds_id`.

    :param base_q: The base query that specifies the collection of datasets
                   under consideration
    :return: The object representing the stats

    Note: The execution of this function requires the Flask app's context
    """

    # Select statement for getting all the Datalad datasets
    dl_ds_q = select(base_q).filter(base_q.c.ds_id.is_not(None)).subquery("dl_ds_q")

    return _get_annex_ds_collection_stats(dl_ds_q)


def get_dl_ds_collection_stats(base_q: Subquery) -> DataladDsCollectionStats:
    """
    Get the stats of the subset of the collection of datasets that consists only
    of Datalad datasets

    :param base_q: The base query that specifies the collection of datasets
                   under consideration
    :return: The object representing the stats, both with datasets deduplicated
        by `ds_id` and with every repo counted individually

    Note: The execution of this function requires the Flask app's context
    """

    return DataladDsCollectionStats(
        unique_ds_stats=get_unique_dl_ds_collection_stats(base_q),
        stats=get_dl_ds_collection_stats_with_dups(base_q),
    )


def get_pure_annex_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionStats:
    """
    Get the stats of the subset of the collection of datasets that consists only
    of pure annex datasets, the annex datasets that are not Datalad datasets

    :param base_q: The base query that specifies the collection of datasets
                   under consideration
    :return: The object representing the stats

    Note: The execution of this function requires the Flask app's context
    """
    # Select statement for getting all the pure annex datasets,
    # i.e. repos with a `git-annex` branch but without a `ds_id`
    pure_annex_ds_q = (
        select(base_q)
        .filter(and_(base_q.c.branches.has_key("git-annex"), base_q.c.ds_id.is_(None)))
        .subquery("pure_annex_ds_q")
    )

    return _get_annex_ds_collection_stats(pure_annex_ds_q)


def get_non_annex_ds_collection_stats(base_q: Subquery) -> NonAnnexDsCollectionStats:
    """
    Get the stats of the subset of the collection of datasets that consists only
    of non-annex datasets

    :param base_q: The base query that specifies the collection of datasets
                   under consideration
    :return: The object representing the stats

    Note: The execution of this function requires the Flask app's context
    """
    # Select statement for getting all the non-annex datasets
    # NOTE(review): a row whose `branches` is NULL satisfies neither this filter
    #   nor the one for pure annex datasets (the negation of NULL is NULL in SQL).
    #   Presumably intended for repos that have not been processed yet; confirm.
    non_annex_ds_q = (
        select(base_q)
        .filter(not_(base_q.c.branches.has_key("git-annex")))
        .subquery("non_annex_ds_q")
    )

    return NonAnnexDsCollectionStats(
        ds_count=db.session.execute(
            select(func.count().label("ds_count")).select_from(non_annex_ds_q)
        ).scalar_one()
    )


def get_collection_stats(select_stmt: Select) -> CollectionStats:
    """
    Get the statistics of the collection of dataset URLs specified by the given select
    statement

    :param select_stmt: The given select statement
    :return: The statistics of the collection of dataset URLs

    Note: The execution of this function requires the Flask app's context
    """

    # Cache the result of the select statement to a temporary table so that
    # the statement is executed only once for all the stats queries below
    tmp_tb = cache_result_to_tmp_tb(select_stmt, "tmp_tb")

    base_q = select(tmp_tb).subquery("base_q")

    datalad_ds_stats = get_dl_ds_collection_stats(base_q)

    # Total number of datasets, as individual repos, without any deduplication
    ds_count = db.session.execute(
        select(func.count().label("ds_count")).select_from(base_q)
    ).scalar_one()

    return CollectionStats(
        datalad_ds_stats=datalad_ds_stats,
        pure_annex_ds_stats=get_pure_annex_ds_collection_stats(base_q),
        non_annex_ds_stats=get_non_annex_ds_collection_stats(base_q),
        summary=StatsSummary(
            unique_ds_count=datalad_ds_stats.unique_ds_stats.ds_count, ds_count=ds_count
        ),
    )
OperationMode from datalad_registry.utils.flask_tools import json_resp_from_str from . import HTTPExceptionResp diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index f8c5915f..7aadcc39 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -4,8 +4,10 @@ import logging from flask import Blueprint, render_template, request +from humanize import intcomma from sqlalchemy import nullslast, select +from datalad_registry.blueprints.api.dataset_urls.tools import get_collection_stats from datalad_registry.models import RepoUrl, db from datalad_registry.search import parse_query @@ -28,11 +30,15 @@ } +# Register humanize.intcomma as a Jinja2 filter +bp.add_app_template_filter(intcomma, "intcomma") + + @bp.get("/") def overview(): # No type hints due to mypy#7187. default_sort_scheme = "update-desc" - select_stmt = select(RepoUrl) + base_select_stmt = select(RepoUrl) # Search using query if provided. # ATM it is just a 'filter' on URL records, later might be more complex @@ -46,25 +52,30 @@ def overview(): # No type hints due to mypy#7187. 
except Exception as e: search_error = str(e) else: - select_stmt = select_stmt.filter(criteria) + base_select_stmt = base_select_stmt.filter(criteria) - # Sort - select_stmt = select_stmt.group_by(RepoUrl) + # Decipher sorting scheme sort_by = request.args.get("sort", default_sort_scheme, type=str) if sort_by not in _SORT_ATTRS: lgr.debug("Ignoring unknown sort parameter: %s", sort_by) sort_by = default_sort_scheme col, sort_method = _SORT_ATTRS[sort_by] - select_stmt = select_stmt.order_by( + + # Apply sorting + select_stmt = base_select_stmt.order_by( nullslast(getattr(getattr(RepoUrl, col), sort_method)()) ) # Paginate pagination = db.paginate(select_stmt) + # Gather stats of the returned collection of datasets + stats = get_collection_stats(base_select_stmt) + return render_template( "overview.html", pagination=pagination, + stats=stats, sort_by=sort_by, search_query=query, search_error=search_error, diff --git a/datalad_registry/static/main.css b/datalad_registry/static/main.css index 3c24aedd..06ba9fc4 100644 --- a/datalad_registry/static/main.css +++ b/datalad_registry/static/main.css @@ -120,7 +120,7 @@ div#datalad-registry div.pagination { /* Modal Content */ .modal-content { position: fixed; - bottom: 50%; + top: 10%; background-color: #fefefe; width: 100%; -webkit-animation-name: slideIn; @@ -131,13 +131,25 @@ div#datalad-registry div.pagination { /* Add Animation */ @-webkit-keyframes slideIn { - from {bottom: -300px; opacity: 0} - to {bottom: 50%; opacity: 1} + from { + top: 50%; + opacity: 0 + } + to { + top: 10%; + opacity: 1 + } } @keyframes slideIn { - from {bottom: -300px; opacity: 0} - to {bottom: 50%; opacity: 1} + from { + top: 50%; + opacity: 0 + } + to { + top: 10%; + opacity: 1 + } } @-webkit-keyframes fadeIn { diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index 75f621bb..45f255b0 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -27,6 
+27,20 @@ {% endmacro %} +{% macro render_annex_ds_collection_stats(annex_ds_col_stats) %} +