From c9e44b2162344e4ee0d0e027be2042ec8b0584d2 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Sat, 6 Apr 2024 12:05:56 +0200 Subject: [PATCH 01/57] Include collection stats in `DatasetURLPage` --- datalad_registry/blueprints/api/dataset_urls/models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index ca000424..20d1a4b0 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -256,6 +256,10 @@ class Config: by_alias = False +class CollectionStats(BaseModel): + pass + + class DatasetURLPage(BaseModel): """ Model for representing a page of dataset URLs in response communication @@ -275,3 +279,8 @@ class DatasetURLPage(BaseModel): dataset_urls: list[DatasetURLRespModel] = Field( description="The list of dataset URLs in the current page" ) + collection_stats: CollectionStats = Field( + description="Statistics about the collection of dataset URLs, " + "not just the URLs in the current page but the entire collection " + "returned" + ) From 6850ec6445e620d7dd9d6d4556df6650fe5d1f4b Mon Sep 17 00:00:00 2001 From: Isaac To Date: Sat, 6 Apr 2024 15:47:54 +0200 Subject: [PATCH 02/57] Define models for representing stats of filter collections --- .../blueprints/api/dataset_urls/models.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 20d1a4b0..2ac6bcbd 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -9,6 +9,7 @@ BaseModel, Field, FileUrl, + NonNegativeInt, PositiveInt, StrictInt, StrictStr, @@ -256,10 +257,54 @@ class Config: by_alias = False -class CollectionStats(BaseModel): +class _AnnexDsCollectionStats(BaseModel): + """ + Model with the base 
components of annex dataset collection statistics + """ + + ds_count: NonNegativeInt = Field(description="The number of datasets") + size_of_annexed_files: NonNegativeInt = Field( + description="The size of annexed files" + ) + annexed_file_count: NonNegativeInt = Field( + description="The number of annexed files" + ) + + +class AnnexDsCollectionStats(BaseModel): + """ + Model for annex dataset collection statistics + """ + + unique_ds_stats: _AnnexDsCollectionStats = Field( + description="Statistics for unique datasets" + ) + stats: _AnnexDsCollectionStats = Field(description="Statistics for all datasets") + + +class NonAnnexDsStats(BaseModel): pass +class StatsSummary(BaseModel): + unique_ds_count: NonNegativeInt = Field(description="The number of unique datasets") + ds_count: NonNegativeInt = Field(description="The number of datasets") + + +class CollectionStats(BaseModel): + datalad_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for DataLad datasets" + ) + pure_annex_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for pure annex datasets" + ) + non_annex_ds_stats: NonAnnexDsStats = Field( + description="Statistics for non-annex datasets" + ) + + summary: StatsSummary = Field(description="Summary statistics") + + class DatasetURLPage(BaseModel): """ Model for representing a page of dataset URLs in response communication From 1c84c51b25d52b5fab496c5c73f475827ea75986 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 9 Apr 2024 18:10:37 -0700 Subject: [PATCH 03/57] Extract base select statement So that it can be used to calculate statistics --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 85005609..9adbfc85 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ 
b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -285,11 +285,11 @@ def cache_path_trans(cache_path: Path) -> str: ep = ".dataset_urls" # Endpoint of `dataset_urls` base_qry = loads(query.json(exclude={"page"}, exclude_none=True)) + base_select = db.select(RepoUrl).filter(and_(True, *constraints)) + max_per_page = 100 # The overriding limit to `per_page` provided by the requester pagination = db.paginate( - db.select(RepoUrl) - .filter(and_(True, *constraints)) - .order_by( + base_select.order_by( getattr( _ORDER_KEY_TO_SQLA_ATTR[query.order_by], query.order_dir.value )().nulls_last() From 11954acf8d935f75c3f06cd620d57f80ee334c05 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 9 Apr 2024 18:18:20 -0700 Subject: [PATCH 04/57] Replace `db.select` with `sqlalchemy.select` They are essentially the same object but `sqlalchemy.select` comes with better typing support --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 9adbfc85..acba9458 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -9,7 +9,7 @@ from flask_openapi3 import APIBlueprint, Tag from lark.exceptions import GrammarError, UnexpectedInput from psycopg2.errors import UniqueViolation -from sqlalchemy import ColumnElement, and_ +from sqlalchemy import ColumnElement, and_, select from sqlalchemy.exc import IntegrityError from datalad_registry.models import RepoUrl, db @@ -92,7 +92,7 @@ def declare_dataset_url(body: DatasetURLSubmitModel): url_as_str = str(body.url) repo_url_row = db.session.execute( - db.select(RepoUrl).filter_by(url=url_as_str) + select(RepoUrl).filter_by(url=url_as_str) ).one_or_none() if repo_url_row is None: # == The URL requested to be created does not exist in the database == @@ -116,7 
+116,7 @@ def declare_dataset_url(body: DatasetURLSubmitModel): # of the URL in the database db.session.rollback() repo_url_added_by_another = ( - db.session.execute(db.select(RepoUrl).filter_by(url=url_as_str)) + db.session.execute(select(RepoUrl).filter_by(url=url_as_str)) .scalars() .one_or_none() ) @@ -285,7 +285,7 @@ def cache_path_trans(cache_path: Path) -> str: ep = ".dataset_urls" # Endpoint of `dataset_urls` base_qry = loads(query.json(exclude={"page"}, exclude_none=True)) - base_select = db.select(RepoUrl).filter(and_(True, *constraints)) + base_select = select(RepoUrl).filter(and_(True, *constraints)) max_per_page = 100 # The overriding limit to `per_page` provided by the requester pagination = db.paginate( From 28d1d08469dcded189d3982d0596a1462e0510bc Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 9 Apr 2024 18:39:50 -0700 Subject: [PATCH 05/57] Rename `base_select` to `base_select_stmt` For consistency --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index acba9458..0f16dfb3 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -285,11 +285,11 @@ def cache_path_trans(cache_path: Path) -> str: ep = ".dataset_urls" # Endpoint of `dataset_urls` base_qry = loads(query.json(exclude={"page"}, exclude_none=True)) - base_select = select(RepoUrl).filter(and_(True, *constraints)) + base_select_stmt = select(RepoUrl).filter(and_(True, *constraints)) max_per_page = 100 # The overriding limit to `per_page` provided by the requester pagination = db.paginate( - base_select.order_by( + base_select_stmt.order_by( getattr( _ORDER_KEY_TO_SQLA_ATTR[query.order_by], query.order_dir.value )().nulls_last() From 2a1859d80054e410531dac116d9faa072ee6524e Mon Sep 17 00:00:00 2001 From: Isaac To Date: 
Tue, 9 Apr 2024 18:51:39 -0700 Subject: [PATCH 06/57] Define func for collection stats and utilize it --- .../blueprints/api/dataset_urls/__init__.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 0f16dfb3..e0dcb0fe 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -9,7 +9,7 @@ from flask_openapi3 import APIBlueprint, Tag from lark.exceptions import GrammarError, UnexpectedInput from psycopg2.errors import UniqueViolation -from sqlalchemy import ColumnElement, and_, select +from sqlalchemy import ColumnElement, Select, and_, select from sqlalchemy.exc import IntegrityError from datalad_registry.models import RepoUrl, db @@ -23,6 +23,7 @@ from datalad_registry.utils.flask_tools import json_resp_from_str from .models import ( + CollectionStats, DatasetURLPage, DatasetURLRespBaseModel, DatasetURLRespModel, @@ -179,6 +180,17 @@ def declare_dataset_url(body: DatasetURLSubmitModel): return json_resp_from_str(resp_model, status=202) +def get_collection_stats(select_stmt: Select) -> CollectionStats: + """ + Get the statistics of the collection of dataset URLs specified by the given select + statement + + :param select_stmt: The given select statement + :return: The statistics of the collection of dataset URLs + """ + pass + + @bp.get("", responses={"200": DatasetURLPage, "400": HTTPExceptionResp}) def dataset_urls(query: QueryParams): """ @@ -356,6 +368,7 @@ def cache_path_trans(cache_path: Path) -> str: first_pg=url_for(ep, **base_qry, page=1), last_pg=url_for(ep, **base_qry, page=1 if total_pages == 0 else total_pages), dataset_urls=ds_urls, + collection_stats=get_collection_stats(base_select_stmt), ) return json_resp_from_str(page.json(exclude_none=True)) From 412ce63e1b83b6724e1afb8204ed6a339ba30b93 Mon Sep 17 00:00:00 2001 From: 
Isaac To Date: Wed, 10 Apr 2024 13:39:41 -0700 Subject: [PATCH 07/57] Rename `AnnexDsCollectionStats` to `DataladDsCollectionStats` This model is more appropriate for representing the stats of DataLad dataset collection --- datalad_registry/blueprints/api/dataset_urls/models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 2ac6bcbd..ef26e3a1 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -271,9 +271,9 @@ class _AnnexDsCollectionStats(BaseModel): ) -class AnnexDsCollectionStats(BaseModel): +class DataladDsCollectionStats(BaseModel): """ - Model for annex dataset collection statistics + Model for DataLad dataset collection statistics """ unique_ds_stats: _AnnexDsCollectionStats = Field( @@ -292,10 +292,10 @@ class StatsSummary(BaseModel): class CollectionStats(BaseModel): - datalad_ds_stats: AnnexDsCollectionStats = Field( + datalad_ds_stats: DataladDsCollectionStats = Field( description="Statistics for DataLad datasets" ) - pure_annex_ds_stats: AnnexDsCollectionStats = Field( + pure_annex_ds_stats: DataladDsCollectionStats = Field( description="Statistics for pure annex datasets" ) non_annex_ds_stats: NonAnnexDsStats = Field( From 4eabb09f1d6d74dba62c3c23f9c841ed8879902a Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 13:41:35 -0700 Subject: [PATCH 08/57] Rename `_AnnexDsCollectionStats` to `AnnexDsCollectionStats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index ef26e3a1..17f852a6 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -257,7 +257,7 @@ 
class Config: by_alias = False -class _AnnexDsCollectionStats(BaseModel): +class AnnexDsCollectionStats(BaseModel): """ Model with the base components of annex dataset collection statistics """ @@ -276,10 +276,10 @@ class DataladDsCollectionStats(BaseModel): Model for DataLad dataset collection statistics """ - unique_ds_stats: _AnnexDsCollectionStats = Field( + unique_ds_stats: AnnexDsCollectionStats = Field( description="Statistics for unique datasets" ) - stats: _AnnexDsCollectionStats = Field(description="Statistics for all datasets") + stats: AnnexDsCollectionStats = Field(description="Statistics for all datasets") class NonAnnexDsStats(BaseModel): From 77b81b1c784828430912eb54c21672f40f138b21 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 13:58:47 -0700 Subject: [PATCH 09/57] Update description `DataladDsCollectionStats.stats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 17f852a6..59344f60 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -279,7 +279,10 @@ class DataladDsCollectionStats(BaseModel): unique_ds_stats: AnnexDsCollectionStats = Field( description="Statistics for unique datasets" ) - stats: AnnexDsCollectionStats = Field(description="Statistics for all datasets") + stats: AnnexDsCollectionStats = Field( + description="Statistics for all datasets, as individual repos, " + "without any deduplication" + ) class NonAnnexDsStats(BaseModel): From 127e9554f40109e7149a3c5ae78ad5437cc70814 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:03:54 -0700 Subject: [PATCH 10/57] Modify `CollectionStats.pure_annex_ds_stats` with the data model that represents it and its description --- datalad_registry/blueprints/api/dataset_urls/models.py | 5 +++-- 1 
file changed, 3 insertions(+), 2 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 59344f60..59bbe59e 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -298,8 +298,9 @@ class CollectionStats(BaseModel): datalad_ds_stats: DataladDsCollectionStats = Field( description="Statistics for DataLad datasets" ) - pure_annex_ds_stats: DataladDsCollectionStats = Field( - description="Statistics for pure annex datasets" + pure_annex_ds_stats: AnnexDsCollectionStats = Field( + description="Statistics for pure annex datasets, as individual repos, " + "without any deduplication" ) non_annex_ds_stats: NonAnnexDsStats = Field( description="Statistics for non-annex datasets" From bf70c4ec48f9048a401cc04d51ab4af9db24152c Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:06:58 -0700 Subject: [PATCH 11/57] Rename `NonAnnexDsStats` to `NonAnnexDsCollectionStats` to achieve consistency of naming --- datalad_registry/blueprints/api/dataset_urls/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 59bbe59e..8aba7ebc 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -285,7 +285,7 @@ class DataladDsCollectionStats(BaseModel): ) -class NonAnnexDsStats(BaseModel): +class NonAnnexDsCollectionStats(BaseModel): pass @@ -302,7 +302,7 @@ class CollectionStats(BaseModel): description="Statistics for pure annex datasets, as individual repos, " "without any deduplication" ) - non_annex_ds_stats: NonAnnexDsStats = Field( + non_annex_ds_stats: NonAnnexDsCollectionStats = Field( description="Statistics for non-annex datasets" ) From 549239348adadf10b88646c7b417823b10f88670 Mon Sep 17 00:00:00 
2001 From: Isaac To Date: Wed, 10 Apr 2024 14:14:51 -0700 Subject: [PATCH 12/57] Improve description for `StatsSummary.ds_count` --- datalad_registry/blueprints/api/dataset_urls/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 8aba7ebc..45d46d98 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -291,7 +291,10 @@ class NonAnnexDsCollectionStats(BaseModel): class StatsSummary(BaseModel): unique_ds_count: NonNegativeInt = Field(description="The number of unique datasets") - ds_count: NonNegativeInt = Field(description="The number of datasets") + ds_count: NonNegativeInt = Field( + description="The number of datasets, as individual repos, " + "without any deduplication" + ) class CollectionStats(BaseModel): From e1e720ca2c6b2f832e630ddce4ba60272f04664a Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:20:40 -0700 Subject: [PATCH 13/57] Define `NonAnnexDsCollectionStats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 45d46d98..1e303a8d 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -286,7 +286,14 @@ class DataladDsCollectionStats(BaseModel): class NonAnnexDsCollectionStats(BaseModel): - pass + """ + Model for non-annex dataset collection statistics + """ + + ds_count: NonNegativeInt = Field( + description="The number of datasets, as individual repos, " + "without any deduplication" + ) class StatsSummary(BaseModel): From 01a90c5e3fa69afc8ecbc85540c2d89d6ecb14fa Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 10 Apr 2024 14:44:25 -0700 
Subject: [PATCH 14/57] Improve description of `CollectionStats.non_annex_ds_stats` --- datalad_registry/blueprints/api/dataset_urls/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 1e303a8d..60e33ccd 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -313,7 +313,8 @@ class CollectionStats(BaseModel): "without any deduplication" ) non_annex_ds_stats: NonAnnexDsCollectionStats = Field( - description="Statistics for non-annex datasets" + description="Statistics for non-annex datasets, as individual repos, " + "without any deduplication" ) summary: StatsSummary = Field(description="Summary statistics") From 15690ad86ab39e5d8338a61ec69fd2d02a2fbea6 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 11 Apr 2024 11:08:00 -0700 Subject: [PATCH 15/57] Improve doc-string of `get_collection_stats` --- datalad_registry/blueprints/api/dataset_urls/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index e0dcb0fe..0361d820 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -187,6 +187,8 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: :param select_stmt: The given select statement :return: The statistics of the collection of dataset URLs + + Note: The execution of this function requires the Flask app's context """ pass From 619cd36f5a98f30b8d69686db3a123070f4d541b Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 11 Apr 2024 13:52:21 -0700 Subject: [PATCH 16/57] Rename variable --- datalad_registry/blueprints/api/dataset_urls/models.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 60e33ccd..b3dd48c7 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -263,9 +263,7 @@ class AnnexDsCollectionStats(BaseModel): """ ds_count: NonNegativeInt = Field(description="The number of datasets") - size_of_annexed_files: NonNegativeInt = Field( - description="The size of annexed files" - ) + annexed_files_size: NonNegativeInt = Field(description="The size of annexed files") annexed_file_count: NonNegativeInt = Field( description="The number of annexed files" ) From 91b07a702c66fbcdd5ff43b6821a327fcd641bcb Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 11 Apr 2024 19:07:32 -0700 Subject: [PATCH 17/57] Define all supporting funcs in a `tools.py` --- .../blueprints/api/dataset_urls/__init__.py | 17 +- .../blueprints/api/dataset_urls/tools.py | 180 ++++++++++++++++++ 2 files changed, 182 insertions(+), 15 deletions(-) create mode 100644 datalad_registry/blueprints/api/dataset_urls/tools.py diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index 0361d820..a3efe8a0 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -9,7 +9,7 @@ from flask_openapi3 import APIBlueprint, Tag from lark.exceptions import GrammarError, UnexpectedInput from psycopg2.errors import UniqueViolation -from sqlalchemy import ColumnElement, Select, and_, select +from sqlalchemy import ColumnElement, and_, select from sqlalchemy.exc import IntegrityError from datalad_registry.models import RepoUrl, db @@ -23,7 +23,6 @@ from datalad_registry.utils.flask_tools import json_resp_from_str from .models import ( - CollectionStats, DatasetURLPage, DatasetURLRespBaseModel, DatasetURLRespModel, @@ -33,6 +32,7 @@ PathParams, QueryParams, ) 
+from .tools import get_collection_stats from .. import ( API_URL_PREFIX, COMMON_API_RESPONSES, @@ -180,19 +180,6 @@ def declare_dataset_url(body: DatasetURLSubmitModel): return json_resp_from_str(resp_model, status=202) -def get_collection_stats(select_stmt: Select) -> CollectionStats: - """ - Get the statistics of the collection of dataset URLs specified by the given select - statement - - :param select_stmt: The given select statement - :return: The statistics of the collection of dataset URLs - - Note: The execution of this function requires the Flask app's context - """ - pass - - @bp.get("", responses={"200": DatasetURLPage, "400": HTTPExceptionResp}) def dataset_urls(query: QueryParams): """ diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py new file mode 100644 index 00000000..7fd73fbb --- /dev/null +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -0,0 +1,180 @@ +from sqlalchemy import Select, Subquery, and_, func, or_, select + +from datalad_registry.models import RepoUrl, db + +from .models import ( + AnnexDsCollectionStats, + CollectionStats, + DataladDsCollectionStats, + NonAnnexDsCollectionStats, + StatsSummary, +) + + +def _get_annex_ds_collection_stats(q: Subquery) -> AnnexDsCollectionStats: + """ + Get the stats of a collection of datasets that contains only of annex datasets + + :param q: The query that specifies the collection of datasets under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + ds_count, annexed_files_size, annexed_file_count = db.session.execute( + select( + func.count().label("ds_count"), + func.sum(q.c.annexed_files_in_wt_size).label("annexed_files_size"), + func.sum(q.c.annexed_files_in_wt_count).label("annexed_file_count"), + ).select_from(q) + ).one() + + return AnnexDsCollectionStats( + ds_count=ds_count, + annexed_files_size=annexed_files_size, + 
annexed_file_count=annexed_file_count, + ) + + +def get_unique_dl_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of Datalad datasets, considering datasets with the same `ds_id` as the same + dataset + + :param base_q: The base query that specified the collection of datasets + under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + grp_by_id_q = ( + select( + base_q.c.ds_id, + func.max(base_q.c.annexed_files_in_wt_size).label( + "max_annexed_files_in_wt_size" + ), + ) + .group_by(base_q.c.ds_id) + .subquery("grp_by_id_q") + ) + + grp_by_id_and_a_f_size_q = ( + select( + RepoUrl.ds_id, + RepoUrl.annexed_files_in_wt_size, + func.max(RepoUrl.annexed_files_in_wt_count).label( + "annexed_files_in_wt_count" + ), + ) + .join( + grp_by_id_q, + and_( + RepoUrl.ds_id == grp_by_id_q.c.ds_id, + or_( + grp_by_id_q.c.max_annexed_files_in_wt_size.is_(None), + RepoUrl.annexed_files_in_wt_size + == grp_by_id_q.c.max_annexed_files_in_wt_size, + ), + ), + ) + .group_by(RepoUrl.ds_id, RepoUrl.annexed_files_in_wt_size) + .subquery("grp_by_id_and_a_f_size_q") + ) + + return _get_annex_ds_collection_stats(grp_by_id_and_a_f_size_q) + + +def get_dl_ds_collection_stats_with_dups(base_q: Subquery) -> AnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of Datalad datasets, considering individual repos as a dataset regardless of + the value of `ds_id`. 
+ + :param base_q: The base query that specified the collection of datasets + under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + # Select statement for getting all the Datalad datasets + dl_ds_q = select(base_q).filter(base_q.c.ds_id.is_not(None)).subquery("dl_ds_q") + + return _get_annex_ds_collection_stats(dl_ds_q) + + +def get_dl_ds_collection_stats(base_q: Subquery) -> DataladDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of Datalad datasets + + :param base_q: The base query that specified the collection of datasets + under consideration + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + + return DataladDsCollectionStats( + unique_ds_stats=get_unique_dl_ds_collection_stats(base_q), + stats=get_dl_ds_collection_stats_with_dups(base_q), + ) + + +def get_pure_annex_ds_collection_stats() -> AnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of pure annex datasets, the annex datasets that are not Datalad datasets + + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + # Select statement for getting all the pure annex datasets + pass + + +def get_non_annex_ds_collection_stats() -> NonAnnexDsCollectionStats: + """ + Get the stats of the subset of the collection of datasets that contains only + of non-annex datasets + + :return: The object representing the stats + + Note: The execution of this function requires the Flask app's context + """ + pass + + +def get_collection_stats(select_stmt: Select) -> CollectionStats: + """ + Get the statistics of the collection of dataset URLs specified by the given select + statement + + :param select_stmt: The given select statement + :return: The statistics of the 
collection of dataset URLs + + Note: The execution of this function requires the Flask app's context + """ + + base_q = select_stmt.subquery("base_q") + + datalad_ds_stats = get_dl_ds_collection_stats(base_q) + pure_annex_ds_stats = get_pure_annex_ds_collection_stats() + non_annex_ds_stats = get_non_annex_ds_collection_stats() + + # Total number of datasets, as individual repos, without any deduplication + ds_count = db.session.execute( + select(func.count().label("ds_count")).select_from(base_q) + ).scalar_one() + + return CollectionStats( + datalad_ds_stats=datalad_ds_stats, + pure_annex_ds_stats=pure_annex_ds_stats, + non_annex_ds_stats=non_annex_ds_stats, + summary=StatsSummary( + unique_ds_count=datalad_ds_stats.unique_ds_stats.ds_count, ds_count=ds_count + ), + ) From e8f28ea1d96e989b0d4bcf6c0bce61f82fcaedc8 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Tue, 16 Apr 2024 18:21:28 -0700 Subject: [PATCH 18/57] Implement getting stats for pure annex ds --- .../blueprints/api/dataset_urls/tools.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py index 7fd73fbb..f59e968e 100644 --- a/datalad_registry/blueprints/api/dataset_urls/tools.py +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -123,17 +123,25 @@ def get_dl_ds_collection_stats(base_q: Subquery) -> DataladDsCollectionStats: ) -def get_pure_annex_ds_collection_stats() -> AnnexDsCollectionStats: +def get_pure_annex_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionStats: """ Get the stats of the subset of the collection of datasets that contains only of pure annex datasets, the annex datasets that are not Datalad datasets + :param base_q: The base query that specified the collection of datasets + under consideration :return: The object representing the stats Note: The execution of this function requires the Flask app's context """ # Select statement 
for getting all the pure annex datasets - pass + pure_annex_ds_q = ( + select(base_q) + .filter(and_(base_q.c.branches.has_key("git-annex"), base_q.c.ds_id.is_(None))) + .subquery("pure_annex_ds_q") + ) + + return _get_annex_ds_collection_stats(pure_annex_ds_q) def get_non_annex_ds_collection_stats() -> NonAnnexDsCollectionStats: @@ -162,7 +170,7 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: base_q = select_stmt.subquery("base_q") datalad_ds_stats = get_dl_ds_collection_stats(base_q) - pure_annex_ds_stats = get_pure_annex_ds_collection_stats() + pure_annex_ds_stats = get_pure_annex_ds_collection_stats(base_q) non_annex_ds_stats = get_non_annex_ds_collection_stats() # Total number of datasets, as individual repos, without any deduplication From 83d71384a06ed20b30809371928cf92b03e270a5 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 12:25:44 -0700 Subject: [PATCH 19/57] Implement the gathering of stats for non-annex ds --- .../blueprints/api/dataset_urls/tools.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py index f59e968e..594d018b 100644 --- a/datalad_registry/blueprints/api/dataset_urls/tools.py +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -1,4 +1,4 @@ -from sqlalchemy import Select, Subquery, and_, func, or_, select +from sqlalchemy import Select, Subquery, and_, func, not_, or_, select from datalad_registry.models import RepoUrl, db @@ -144,16 +144,29 @@ def get_pure_annex_ds_collection_stats(base_q: Subquery) -> AnnexDsCollectionSta return _get_annex_ds_collection_stats(pure_annex_ds_q) -def get_non_annex_ds_collection_stats() -> NonAnnexDsCollectionStats: +def get_non_annex_ds_collection_stats(base_q: Subquery) -> NonAnnexDsCollectionStats: """ Get the stats of the subset of the collection of datasets that contains only of non-annex datasets + :param 
base_q: The base query that specified the collection of datasets + under consideration :return: The object representing the stats Note: The execution of this function requires the Flask app's context """ - pass + # Select statement for getting all the non-annex datasets + non_annex_ds_q = ( + select(base_q) + .filter(not_(base_q.c.branches.has_key("git-annex"))) + .subquery("non_annex_ds_q") + ) + + return NonAnnexDsCollectionStats( + ds_count=db.session.execute( + select(func.count().label("ds_count")).select_from(non_annex_ds_q) + ).scalar_one() + ) def get_collection_stats(select_stmt: Select) -> CollectionStats: @@ -171,7 +184,7 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: datalad_ds_stats = get_dl_ds_collection_stats(base_q) pure_annex_ds_stats = get_pure_annex_ds_collection_stats(base_q) - non_annex_ds_stats = get_non_annex_ds_collection_stats() + non_annex_ds_stats = get_non_annex_ds_collection_stats(base_q) # Total number of datasets, as individual repos, without any deduplication ds_count = db.session.execute( From 55728ffdd4b160634668c7b58dd8a91bfe27c737 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 12:31:15 -0700 Subject: [PATCH 20/57] Format: Inline function calls --- datalad_registry/blueprints/api/dataset_urls/tools.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/tools.py b/datalad_registry/blueprints/api/dataset_urls/tools.py index 594d018b..8c1a8b55 100644 --- a/datalad_registry/blueprints/api/dataset_urls/tools.py +++ b/datalad_registry/blueprints/api/dataset_urls/tools.py @@ -183,8 +183,6 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: base_q = select_stmt.subquery("base_q") datalad_ds_stats = get_dl_ds_collection_stats(base_q) - pure_annex_ds_stats = get_pure_annex_ds_collection_stats(base_q) - non_annex_ds_stats = get_non_annex_ds_collection_stats(base_q) # Total number of datasets, as individual repos, without 
any deduplication ds_count = db.session.execute( @@ -193,8 +191,8 @@ def get_collection_stats(select_stmt: Select) -> CollectionStats: return CollectionStats( datalad_ds_stats=datalad_ds_stats, - pure_annex_ds_stats=pure_annex_ds_stats, - non_annex_ds_stats=non_annex_ds_stats, + pure_annex_ds_stats=get_pure_annex_ds_collection_stats(base_q), + non_annex_ds_stats=get_non_annex_ds_collection_stats(base_q), summary=StatsSummary( unique_ds_count=datalad_ds_stats.unique_ds_stats.ds_count, ds_count=ds_count ), From 926d8352a5d8d0218756f9de202cb781cb5236bc Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 14:10:06 -0700 Subject: [PATCH 21/57] Make `annexed_files_size` and `annexed_file_count` optional In the case of all rows have the column `annexed_files_in_wt_count` or `annexed_files_in_wt_size` be `null`, `annexed_files_size` and `annexed_file_count` can be assigned to `None` respectively --- datalad_registry/blueprints/api/dataset_urls/models.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index b3dd48c7..65ff0a65 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -263,9 +263,11 @@ class AnnexDsCollectionStats(BaseModel): """ ds_count: NonNegativeInt = Field(description="The number of datasets") - annexed_files_size: NonNegativeInt = Field(description="The size of annexed files") - annexed_file_count: NonNegativeInt = Field( - description="The number of annexed files" + annexed_files_size: Optional[NonNegativeInt] = Field( + None, description="The size of annexed files" + ) + annexed_file_count: Optional[NonNegativeInt] = Field( + None, description="The number of annexed files" ) From 6cb335d8258f5daed64e157755f37bba47c7f141 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 18:45:43 -0700 Subject: [PATCH 22/57] Provide dummy 
`CollectionStats` in tests of `registry-get-urls` client --- .../tests/test_get_urls.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/datalad_registry_client/tests/test_get_urls.py b/datalad_registry_client/tests/test_get_urls.py index 5dc3e771..bc3f7bae 100644 --- a/datalad_registry_client/tests/test_get_urls.py +++ b/datalad_registry_client/tests/test_get_urls.py @@ -9,8 +9,13 @@ from yarl import URL from datalad_registry.blueprints.api.dataset_urls.models import ( + AnnexDsCollectionStats, + CollectionStats, + DataladDsCollectionStats, DatasetURLPage, DatasetURLRespModel, + NonAnnexDsCollectionStats, + StatsSummary, ) from datalad_registry_client import DEFAULT_BASE_ENDPOINT @@ -40,6 +45,20 @@ def __init__(self, status_code, text): metadata=[], ) +# A dummy `AnnexDsCollectionStats` object +annex_ds_collection_stats = AnnexDsCollectionStats( + ds_count=101, annexed_files_size=1900, annexed_file_count=42 +) +# A dummy `CollectionStats` object +collection_stats = CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=annex_ds_collection_stats, stats=annex_ds_collection_stats + ), + pure_annex_ds_stats=annex_ds_collection_stats, + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=40), + summary=StatsSummary(unique_ds_count=101, ds_count=999), +) + def test_register(): """ @@ -91,6 +110,7 @@ def mock_get(s, url): # noqa: U100 Unused argument url="https://www.example.com" ) ], + collection_stats=collection_stats, ).json(exclude_none=True), ) else: @@ -142,6 +162,7 @@ def mock_get(s, url): # noqa: U100 Unused argument url="https://www.example.com" ) ], + collection_stats=collection_stats, ).json(exclude_none=True), ) else: @@ -185,6 +206,7 @@ def ds_url_pgs(): DatasetURLRespModel(**dataset_url_resp_model_template, url=url) for url in pg ], + collection_stats=collection_stats, ) ds_url_pgs_iter = ds_url_pgs() @@ -250,6 +272,7 @@ def mock_responses(): url="https://www.example.com" ) ], + 
collection_stats=collection_stats, ).json(exclude_none=True), ) From 6fb4df43b961fd1e8ed837ddd7d84b47125f2e71 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Wed, 17 Apr 2024 19:53:13 -0700 Subject: [PATCH 23/57] Remove `total` in `DatasetURLPage` `total` is now `DatasetURLPage.collection_stats.summary.ds_count` --- .../blueprints/api/dataset_urls/__init__.py | 1 - .../blueprints/api/dataset_urls/models.py | 3 -- .../test_api/test_dataset_urls.py | 6 ++-- .../tests/test_get_urls.py | 29 ++++++++++--------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/datalad_registry/blueprints/api/dataset_urls/__init__.py b/datalad_registry/blueprints/api/dataset_urls/__init__.py index a3efe8a0..cff6f476 100644 --- a/datalad_registry/blueprints/api/dataset_urls/__init__.py +++ b/datalad_registry/blueprints/api/dataset_urls/__init__.py @@ -342,7 +342,6 @@ def cache_path_trans(cache_path: Path) -> str: assert pagination.total is not None page = DatasetURLPage( - total=pagination.total, cur_pg_num=cur_pg_num, prev_pg=( url_for(ep, **base_qry, page=pagination.prev_num) diff --git a/datalad_registry/blueprints/api/dataset_urls/models.py b/datalad_registry/blueprints/api/dataset_urls/models.py index 65ff0a65..d6f3773b 100644 --- a/datalad_registry/blueprints/api/dataset_urls/models.py +++ b/datalad_registry/blueprints/api/dataset_urls/models.py @@ -325,9 +325,6 @@ class DatasetURLPage(BaseModel): Model for representing a page of dataset URLs in response communication """ - total: StrictInt = Field( - description="The total number of dataset URLs across all pages" - ) cur_pg_num: StrictInt = Field(description="The number of the current page") prev_pg: Optional[StrictStr] = Field( None, description="The link to the previous page" diff --git a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py index 2a5b9387..486fffdb 100644 --- 
a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py +++ b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py @@ -398,12 +398,12 @@ def test_filter(self, flask_client, query_params, expected_output): ds_url_page = DatasetURLPage.parse_raw(resp.text) - assert ds_url_page.total == expected_out_count assert ds_url_page.cur_pg_num == DEFAULT_PAGE assert ds_url_page.prev_pg is None assert ds_url_page.next_pg is None assert YURL(ds_url_page.first_pg).query["page"] == "1" assert YURL(ds_url_page.last_pg).query["page"] == "1" + assert ds_url_page.collection_stats.summary.ds_count == expected_out_count # Check the collection of dataset URLs assert {i.url for i in ds_url_page.dataset_urls} == expected_output @@ -510,11 +510,11 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): resp_json = resp.json ds_url_pg = DatasetURLPage.parse_obj(resp_json) - assert ds_url_pg.total == 4 assert ds_url_pg.cur_pg_num == 1 assert "prev_pg" not in resp_json assert ds_url_pg.prev_pg is None assert ds_url_pg.next_pg is not None + assert ds_url_pg.collection_stats.summary.ds_count == 4 next_pg_lk, first_pg_lk, last_pg_lk = ( YURL(pg) @@ -548,11 +548,11 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): resp_json = resp.json ds_url_pg = DatasetURLPage.parse_obj(resp_json) - assert ds_url_pg.total == 4 assert ds_url_pg.cur_pg_num == 2 assert ds_url_pg.prev_pg is not None assert "next_pg" not in resp_json assert ds_url_pg.next_pg is None + assert ds_url_pg.collection_stats.summary.ds_count == 4 prev_pg_lk, first_pg_lk, last_pg_lk = ( YURL(pg) diff --git a/datalad_registry_client/tests/test_get_urls.py b/datalad_registry_client/tests/test_get_urls.py index bc3f7bae..73184529 100644 --- a/datalad_registry_client/tests/test_get_urls.py +++ b/datalad_registry_client/tests/test_get_urls.py @@ -45,18 +45,20 @@ def __init__(self, status_code, text): metadata=[], ) -# A dummy `AnnexDsCollectionStats` object +# Dummy stats 
objects annex_ds_collection_stats = AnnexDsCollectionStats( ds_count=101, annexed_files_size=1900, annexed_file_count=42 ) -# A dummy `CollectionStats` object +dl_ds_collection_stats = DataladDsCollectionStats( + unique_ds_stats=annex_ds_collection_stats, stats=annex_ds_collection_stats +) +non_annex_ds_collection_stats = NonAnnexDsCollectionStats(ds_count=40) +stats_summary = StatsSummary(unique_ds_count=101, ds_count=999) collection_stats = CollectionStats( - datalad_ds_stats=DataladDsCollectionStats( - unique_ds_stats=annex_ds_collection_stats, stats=annex_ds_collection_stats - ), + datalad_ds_stats=dl_ds_collection_stats, pure_annex_ds_stats=annex_ds_collection_stats, - non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=40), - summary=StatsSummary(unique_ds_count=101, ds_count=999), + non_annex_ds_stats=non_annex_ds_collection_stats, + summary=stats_summary, ) @@ -98,7 +100,6 @@ def mock_get(s, url): # noqa: U100 Unused argument return MockResponse( 200, DatasetURLPage( - total=200, cur_pg_num=1, prev_pg="dummy", next_pg=None, @@ -150,7 +151,6 @@ def mock_get(s, url): # noqa: U100 Unused argument return MockResponse( 200, DatasetURLPage( - total=100, cur_pg_num=1, prev_pg="dummy", next_pg=None, @@ -191,12 +191,11 @@ def test_handle_successful_response(self, resp_pgs: list[list[str]], monkeypatch """ def ds_url_pgs(): - total = sum(len(pg) for pg in resp_pgs) + ds_count = sum(len(pg) for pg in resp_pgs) for i, pg in enumerate(resp_pgs): # noinspection PyTypeChecker yield DatasetURLPage( - total=total, cur_pg_num=i + 1, prev_pg=None if i == 0 else "foo", next_pg=None if i == len(resp_pgs) - 1 else "foo", @@ -206,7 +205,12 @@ def ds_url_pgs(): DatasetURLRespModel(**dataset_url_resp_model_template, url=url) for url in pg ], - collection_stats=collection_stats, + collection_stats=CollectionStats( + datalad_ds_stats=dl_ds_collection_stats, + pure_annex_ds_stats=annex_ds_collection_stats, + non_annex_ds_stats=non_annex_ds_collection_stats, + 
summary=StatsSummary(unique_ds_count=101, ds_count=ds_count), + ), ) ds_url_pgs_iter = ds_url_pgs() @@ -260,7 +264,6 @@ def mock_responses(): yield MockResponse( 200, DatasetURLPage( - total=200, cur_pg_num=i + 1, prev_pg=None if i == 0 else "foo", next_pg="bar", From e9f027bfed9d8d3bc7e6a858c339450d26a850fa Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 10:38:48 -0700 Subject: [PATCH 24/57] RF: Take out the URL population logic from the fixture The logic can be reuse this way --- datalad_registry/tests/conftest.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/datalad_registry/tests/conftest.py b/datalad_registry/tests/conftest.py index 00fefd81..4388095e 100644 --- a/datalad_registry/tests/conftest.py +++ b/datalad_registry/tests/conftest.py @@ -258,12 +258,28 @@ def populate_with_2_dataset_urls(flask_app): db.session.commit() +def _populate_with_dataset_urls(urls: list[RepoUrl], flask_app): + """ + Populate the `repo_url` table with a list of RepoUrl objects + + :param urls: The list of RepoUrl objects to populate + :return: The list of URLs, expressed in `str`, that were added to the database + """ + + with flask_app.app_context(): + for url in urls: + db.session.add(url) + db.session.commit() + + return [url.url for url in urls] + + @pytest.fixture def populate_with_dataset_urls(flask_app) -> list[str]: """ - Populate the url table with a list of DatasetURLs. 
+ Populate the `repo_url` table with a list of RepoUrl objects - Returns: The list of DatasetURLs that were added to the database + Returns: The list of URLs, expressed in `str`, that were added to the database """ urls = [ @@ -319,12 +335,7 @@ def populate_with_dataset_urls(flask_app) -> list[str]: ), ] - with flask_app.app_context(): - for url in urls: - db.session.add(url) - db.session.commit() - - return [url.url for url in urls] + return _populate_with_dataset_urls(urls, flask_app) @pytest.fixture From f1f8887a4b579bbefdc38282611abea727f85e18 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 10:45:58 -0700 Subject: [PATCH 25/57] Rename pytest fixture --- datalad_registry/tests/conftest.py | 6 +++--- .../test_blueprints/test_api/test_dataset_urls.py | 12 ++++++------ datalad_registry/tests/test_overview.py | 10 +++++----- datalad_registry/tests/test_search.py | 2 +- .../tests/test_tasks/test_chk_url_to_update.py | 6 +++--- .../tests/test_tasks/test_mark_for_chk.py | 4 ++-- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/datalad_registry/tests/conftest.py b/datalad_registry/tests/conftest.py index 4388095e..20f6a7c1 100644 --- a/datalad_registry/tests/conftest.py +++ b/datalad_registry/tests/conftest.py @@ -275,9 +275,9 @@ def _populate_with_dataset_urls(urls: list[RepoUrl], flask_app): @pytest.fixture -def populate_with_dataset_urls(flask_app) -> list[str]: +def populate_with_std_ds_urls(flask_app) -> list[str]: """ - Populate the `repo_url` table with a list of RepoUrl objects + Populate the `repo_url` table with a list of standard (typical) RepoUrl objects Returns: The list of URLs, expressed in `str`, that were added to the database """ @@ -340,7 +340,7 @@ def populate_with_dataset_urls(flask_app) -> list[str]: @pytest.fixture def populate_with_url_metadata( - populate_with_dataset_urls, # noqa: U100 (unused argument) + populate_with_std_ds_urls, # noqa: U100 (unused argument) flask_app, ): """ diff --git 
a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py index 486fffdb..2f13e920 100644 --- a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py +++ b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py @@ -119,7 +119,7 @@ def mock_commit(_scoped_session_obj): "/api/v2/dataset-urls", json={"url": "https://www.example.com"} ) - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "url, expected_mark_for_chk_delay_args", [ @@ -255,7 +255,7 @@ def test_valid_query_params(self, flask_client, query_params): resp = flask_client.get("/api/v2/dataset-urls", query_string=query_params) assert resp.status_code == 200 - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "query_params, expected_output", [ @@ -408,7 +408,7 @@ def test_filter(self, flask_client, query_params, expected_output): # Check the collection of dataset URLs assert {i.url for i in ds_url_page.dataset_urls} == expected_output - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "query_params", [ @@ -494,7 +494,7 @@ def test_metadata_return(self, metadata_ret_opt, flask_client): assert all(type(m) is metadata_ret_type for m in url.metadata) - def test_pagination(self, populate_with_dataset_urls, flask_client): + def test_pagination(self, populate_with_std_ds_urls, flask_client): """ Test the pagination of the results """ @@ -578,9 +578,9 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): for url in ds_url_pg.dataset_urls: ds_urls.add(str(url.url)) - assert ds_urls == set(populate_with_dataset_urls) + assert ds_urls == set(populate_with_std_ds_urls) - @pytest.mark.usefixtures("populate_with_dataset_urls") + 
@pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "query_params, expected_results_by_id_prefix", [ diff --git a/datalad_registry/tests/test_overview.py b/datalad_registry/tests/test_overview.py index 3472ec65..2763f051 100644 --- a/datalad_registry/tests/test_overview.py +++ b/datalad_registry/tests/test_overview.py @@ -6,7 +6,7 @@ class TestOverView: - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "sort_by, expected_order", [ @@ -143,7 +143,7 @@ def test_sorting( assert url_list == expected_order - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "search_query, expected_results", [ @@ -180,7 +180,7 @@ def test_search_with_valid_query( assert url_list == expected_results - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "search_query, err_msg_prefix", [ @@ -206,7 +206,7 @@ def test_search_with_invalid_query( assert (error_span := soup.find("span", class_="error")) assert error_span.text.startswith(f"ERROR: {err_msg_prefix}") - def test_pagination(self, populate_with_dataset_urls, flask_client): + def test_pagination(self, populate_with_std_ds_urls, flask_client): """ Test pagination in Web UI """ @@ -272,7 +272,7 @@ def test_pagination(self, populate_with_dataset_urls, flask_client): assert page_1_link.query["per_page"] == "2" assert page_1_link.query["sort"] == "update-desc" - assert ds_urls == set(populate_with_dataset_urls) + assert ds_urls == set(populate_with_std_ds_urls) @pytest.mark.usefixtures("populate_with_url_metadata") def test_metadata(self, flask_client): diff --git a/datalad_registry/tests/test_search.py b/datalad_registry/tests/test_search.py index 137e276f..39f85ec1 100644 --- a/datalad_registry/tests/test_search.py +++ 
b/datalad_registry/tests/test_search.py @@ -12,7 +12,7 @@ @pytest.fixture def populate_with_url_metadata_for_search( - populate_with_dataset_urls, # noqa: U100 (unused argument) + populate_with_std_ds_urls, # noqa: U100 (unused argument) flask_app, ): """ diff --git a/datalad_registry/tests/test_tasks/test_chk_url_to_update.py b/datalad_registry/tests/test_tasks/test_chk_url_to_update.py index 525cc0ff..ea351912 100644 --- a/datalad_registry/tests/test_tasks/test_chk_url_to_update.py +++ b/datalad_registry/tests/test_tasks/test_chk_url_to_update.py @@ -13,7 +13,7 @@ # and the db and the cache are clean @pytest.mark.usefixtures("flask_app") class TestChkUrlToUpdate: - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize("invalid_url_id", [-1, 0, 5, 10]) def test_repo_url_not_found(self, invalid_url_id): """ @@ -21,7 +21,7 @@ def test_repo_url_not_found(self, invalid_url_id): """ assert chk_url_to_update(invalid_url_id, None) is ChkUrlStatus.ABORTED - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "url_id, initial_last_chk_dt", [ @@ -38,7 +38,7 @@ def test_chk_handled_by_another_process(self, url_id, initial_last_chk_dt): """ assert chk_url_to_update(url_id, initial_last_chk_dt) is ChkUrlStatus.SKIPPED - @pytest.mark.usefixtures("populate_with_dataset_urls", "fix_datetime_now") + @pytest.mark.usefixtures("populate_with_std_ds_urls", "fix_datetime_now") @pytest.mark.parametrize( "url_id, initial_last_chk_dt, original_n_failed_chks, original_chk_req_dt", [ diff --git a/datalad_registry/tests/test_tasks/test_mark_for_chk.py b/datalad_registry/tests/test_tasks/test_mark_for_chk.py index 079c5c63..696c029d 100644 --- a/datalad_registry/tests/test_tasks/test_mark_for_chk.py +++ b/datalad_registry/tests/test_tasks/test_mark_for_chk.py @@ -11,7 +11,7 @@ # and the db and the cache are clean 
@pytest.mark.usefixtures("flask_app") class TestMarkForChk: - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize("url_id", [5, 42]) def test_non_existing_url(self, url_id, mocker: MockerFixture): """ @@ -32,7 +32,7 @@ def now(cls, *args, **kwargs): datetime_mock.now.assert_not_called() - @pytest.mark.usefixtures("populate_with_dataset_urls") + @pytest.mark.usefixtures("populate_with_std_ds_urls") @pytest.mark.parametrize( "url_id, original_chk_req_dt, expecting_chk_req_dt_changed", [ From f1841822a199ac655665b8d4a9def625acceae06 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 10:57:42 -0700 Subject: [PATCH 26/57] RF: Move the URL population logic to `tools.py` This allows reuse of the logic by tests in different files --- datalad_registry/tests/conftest.py | 20 +++----------------- datalad_registry/tests/tools.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 datalad_registry/tests/tools.py diff --git a/datalad_registry/tests/conftest.py b/datalad_registry/tests/conftest.py index 20f6a7c1..410ba586 100644 --- a/datalad_registry/tests/conftest.py +++ b/datalad_registry/tests/conftest.py @@ -16,6 +16,8 @@ from datalad_registry.models import RepoUrl, URLMetadata, db from datalad_registry.utils.datalad_tls import clone +from .tools import populate_with_dataset_urls + @pytest.fixture(scope="session") def set_test_env(tmp_path_factory): @@ -258,22 +260,6 @@ def populate_with_2_dataset_urls(flask_app): db.session.commit() -def _populate_with_dataset_urls(urls: list[RepoUrl], flask_app): - """ - Populate the `repo_url` table with a list of RepoUrl objects - - :param urls: The list of RepoUrl objects to populate - :return: The list of URLs, expressed in `str`, that were added to the database - """ - - with flask_app.app_context(): - for url in urls: - db.session.add(url) - db.session.commit() - - return [url.url for url in 
urls] - - @pytest.fixture def populate_with_std_ds_urls(flask_app) -> list[str]: """ @@ -335,7 +321,7 @@ def populate_with_std_ds_urls(flask_app) -> list[str]: ), ] - return _populate_with_dataset_urls(urls, flask_app) + return populate_with_dataset_urls(urls, flask_app) @pytest.fixture diff --git a/datalad_registry/tests/tools.py b/datalad_registry/tests/tools.py new file mode 100644 index 00000000..3c64dae4 --- /dev/null +++ b/datalad_registry/tests/tools.py @@ -0,0 +1,21 @@ +# This file contains helper functions for testing purposes + +from datalad_registry.models import RepoUrl, db + + +def populate_with_dataset_urls(urls: list[RepoUrl], flask_app): + """ + Populate the `repo_url` table with a list of RepoUrl objects + + :param urls: The list of RepoUrl objects to populate + :param flask_app: The Flask app instance which provides the context for + database access + :return: The list of URLs, expressed in `str`, that were added to the database + """ + + with flask_app.app_context(): + for url in urls: + db.session.add(url) + db.session.commit() + + return [url.url for url in urls] From bda5f9c4f072ee3bf9f7f9d3f1d67de5b9b5a8e3 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 11:03:06 -0700 Subject: [PATCH 27/57] RF: Simplify the expression to add RepoUrls to DB --- datalad_registry/tests/tools.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datalad_registry/tests/tools.py b/datalad_registry/tests/tools.py index 3c64dae4..34c7d12d 100644 --- a/datalad_registry/tests/tools.py +++ b/datalad_registry/tests/tools.py @@ -14,8 +14,7 @@ def populate_with_dataset_urls(urls: list[RepoUrl], flask_app): """ with flask_app.app_context(): - for url in urls: - db.session.add(url) + db.session.add_all(urls) db.session.commit() return [url.url for url in urls] From ee603688d7e3971ced73b2fd45c5b8708b4509f3 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 12:23:04 -0700 Subject: [PATCH 28/57] Fully type annotate 
`populate_with_dataset_urls` --- datalad_registry/tests/tools.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_registry/tests/tools.py b/datalad_registry/tests/tools.py index 34c7d12d..f07a9ca7 100644 --- a/datalad_registry/tests/tools.py +++ b/datalad_registry/tests/tools.py @@ -1,9 +1,11 @@ # This file contains helper functions for testing purposes +from flask import Flask + from datalad_registry.models import RepoUrl, db -def populate_with_dataset_urls(urls: list[RepoUrl], flask_app): +def populate_with_dataset_urls(urls: list[RepoUrl], flask_app: Flask) -> list[str]: """ Populate the `repo_url` table with a list of RepoUrl objects From e8c0d1c8e02680032dc7c2516630e4f824fb6c07 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Thu, 18 Apr 2024 17:09:50 -0700 Subject: [PATCH 29/57] Provide tests for return dataset collect stats --- .../test_api/test_dataset_urls.py | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) diff --git a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py index 2f13e920..9e22c6f1 100644 --- a/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py +++ b/datalad_registry/tests/test_blueprints/test_api/test_dataset_urls.py @@ -7,14 +7,21 @@ from datalad_registry.blueprints.api.dataset_urls import DatasetURLRespModel from datalad_registry.blueprints.api.dataset_urls.models import ( DEFAULT_PAGE, + AnnexDsCollectionStats, + CollectionStats, + DataladDsCollectionStats, DatasetURLPage, MetadataReturnOption, + NonAnnexDsCollectionStats, + StatsSummary, ) from datalad_registry.blueprints.api.url_metadata.models import ( URLMetadataModel, URLMetadataRef, ) from datalad_registry.conf import OperationMode +from datalad_registry.models import RepoUrl +from datalad_registry.tests.tools import populate_with_dataset_urls class TestDeclareDatasetURL: @@ -675,6 +682,242 @@ def test_ordering(self, query_params, 
expected_results_by_id_prefix, flask_clien == expected_results_by_id_prefix ) + @pytest.mark.parametrize( + "query_params, expected_stats", + [ + ( + {}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=3, + annexed_files_size=400 + 1001, + annexed_file_count=50 + 100 + 150, + ), + stats=AnnexDsCollectionStats( + ds_count=6, + annexed_files_size=1000 + 1001 + 400, + annexed_file_count=120 + 50 + 100 + 120 + 150 + 130, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=1, annexed_files_size=600, annexed_file_count=100 + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=1), + summary=StatsSummary(unique_ds_count=3, ds_count=9), + ), + ), + ( + {"search": "url:datalad"}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=0), + summary=StatsSummary(unique_ds_count=2, ds_count=2), + ), + ), + ( + {"search": "url:.org"}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1000 + 400, + annexed_file_count=120 + 50, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=1), + summary=StatsSummary(unique_ds_count=2, ds_count=4), + ), + ), + ( + # === The case of an empty set of dataset URLs returned === + {"search": "url:.tv"}, + 
CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=0, + annexed_files_size=None, + annexed_file_count=None, + ), + stats=AnnexDsCollectionStats( + ds_count=0, + annexed_files_size=None, + annexed_file_count=None, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=0), + summary=StatsSummary(unique_ds_count=0, ds_count=0), + ), + ), + ( + {"search": "url:distribits.live"}, + CollectionStats( + datalad_ds_stats=DataladDsCollectionStats( + unique_ds_stats=AnnexDsCollectionStats( + ds_count=2, + annexed_files_size=1001, + annexed_file_count=100 + 150, + ), + stats=AnnexDsCollectionStats( + ds_count=4, + annexed_files_size=1001, + annexed_file_count=100 + 120 + 150 + 130, + ), + ), + pure_annex_ds_stats=AnnexDsCollectionStats( + ds_count=0, annexed_files_size=None, annexed_file_count=None + ), + non_annex_ds_stats=NonAnnexDsCollectionStats(ds_count=0), + summary=StatsSummary(unique_ds_count=2, ds_count=4), + ), + ), + ], + ) + def test_stats(self, query_params, expected_stats, flask_app, flask_client): + """ + Test the compilation of stats regarding the returned dataset URLs + """ + + # Populate the DB with dataset URLs suitable for testing the stats + urls = [ + RepoUrl( + url="https://www.example.com", + ds_id=None, + annexed_files_in_wt_count=100, + annexed_files_in_wt_size=600, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fd6db", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="http://www.datalad.org", + ds_id="2a0b7b7b-a984-4c4a-844c-be3132291a7c", + annexed_files_in_wt_count=120, + annexed_files_in_wt_size=1000, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fd6ta", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://handbook.datalad.org", + 
ds_id="2b73b99e-59cc-4f35-833a-69c75ca5b0c5", + annexed_files_in_wt_count=50, + annexed_files_in_wt_size=400, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fd6cc", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://www.dandiarchive.org", + ds_id=None, + annexed_files_in_wt_count=100, + annexed_files_in_wt_size=300, + ), + RepoUrl( + url="https://distribits.live", + ds_id="2a0b7b7b-a984-4c4a-844c-be3132291a7c", + annexed_files_in_wt_count=100, + annexed_files_in_wt_size=1001, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://distribits.live/1", + ds_id="48185fb3-aa80-47b4-8ab1-1d7d9fc8b192", + annexed_files_in_wt_count=120, + annexed_files_in_wt_size=None, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://distribits.live/2", + ds_id="48185fb3-aa80-47b4-8ab1-1d7d9fc8b192", + annexed_files_in_wt_count=150, + annexed_files_in_wt_size=None, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://distribits.live/3", + ds_id="48185fb3-aa80-47b4-8ab1-1d7d9fc8b192", + annexed_files_in_wt_count=130, + annexed_files_in_wt_size=None, + branches={ + "git-annex": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + } + }, + ), + RepoUrl( + url="https://centerforopenneuroscience.org", + ds_id=None, + annexed_files_in_wt_count=None, + annexed_files_in_wt_size=None, + branches={ + "main": { + "hexsha": "f21cff198ce84438bd60d459577401d7168fdaba", + "last_commit_dt": "2022-11-18T19:18:23+00:00", + }, + "dev": { + "hexsha": "f21cff198ce84438bd60d459577401d7175fdaba", + "last_commit_dt": 
"2022-11-18T19:18:23+00:00", + }, + }, + ), + ] + populate_with_dataset_urls(urls, flask_app) + + resp = flask_client.get("/api/v2/dataset-urls", query_string=query_params) + + assert DatasetURLPage.parse_raw(resp.text).collection_stats == expected_stats + @pytest.mark.usefixtures("populate_with_2_dataset_urls") class TestDatasetURL: From 80df6ac67cf20c41ea6b7ac15803a8fe2f54edef Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:21:39 -0700 Subject: [PATCH 30/57] Remove grouping by `RepoUrl` This statement is not needed because `RepoUrl` objects are already unique --- datalad_registry/overview.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index f8c5915f..a75ebf2d 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -49,7 +49,6 @@ def overview(): # No type hints due to mypy#7187. select_stmt = select_stmt.filter(criteria) # Sort - select_stmt = select_stmt.group_by(RepoUrl) sort_by = request.args.get("sort", default_sort_scheme, type=str) if sort_by not in _SORT_ATTRS: lgr.debug("Ignoring unknown sort parameter: %s", sort_by) From b124dc339a7eec02a95feec88cd74458d7c0aba3 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:25:27 -0700 Subject: [PATCH 31/57] Reorganize code with improved comments --- datalad_registry/overview.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index a75ebf2d..27f03eeb 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -48,12 +48,14 @@ def overview(): # No type hints due to mypy#7187. 
else: select_stmt = select_stmt.filter(criteria) - # Sort + # Decipher sorting scheme sort_by = request.args.get("sort", default_sort_scheme, type=str) if sort_by not in _SORT_ATTRS: lgr.debug("Ignoring unknown sort parameter: %s", sort_by) sort_by = default_sort_scheme col, sort_method = _SORT_ATTRS[sort_by] + + # Apply sorting select_stmt = select_stmt.order_by( nullslast(getattr(getattr(RepoUrl, col), sort_method)()) ) From f0db49c38e342e1c9aa85fc678ac2c76520a2cb8 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:32:13 -0700 Subject: [PATCH 32/57] Capture the "base select statement" This statement exclude the ordering of the elements. Gathering stats using this statement results in simpler query. --- datalad_registry/overview.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index 27f03eeb..4b99ac9a 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -32,7 +32,7 @@ def overview(): # No type hints due to mypy#7187. default_sort_scheme = "update-desc" - select_stmt = select(RepoUrl) + base_select_stmt = select(RepoUrl) # Search using query if provided. # ATM it is just a 'filter' on URL records, later might be more complex @@ -46,7 +46,7 @@ def overview(): # No type hints due to mypy#7187. except Exception as e: search_error = str(e) else: - select_stmt = select_stmt.filter(criteria) + base_select_stmt = base_select_stmt.filter(criteria) # Decipher sorting scheme sort_by = request.args.get("sort", default_sort_scheme, type=str) @@ -56,7 +56,7 @@ def overview(): # No type hints due to mypy#7187. 
col, sort_method = _SORT_ATTRS[sort_by] # Apply sorting - select_stmt = select_stmt.order_by( + select_stmt = base_select_stmt.order_by( nullslast(getattr(getattr(RepoUrl, col), sort_method)()) ) From 177db1651de86ba3f2ed04e13309aec8dc99388d Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 10:40:12 -0700 Subject: [PATCH 33/57] Gather returned dataset collection stats and pass it to web UI --- datalad_registry/overview.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py index 4b99ac9a..4e151abd 100644 --- a/datalad_registry/overview.py +++ b/datalad_registry/overview.py @@ -6,6 +6,7 @@ from flask import Blueprint, render_template, request from sqlalchemy import nullslast, select +from datalad_registry.blueprints.api.dataset_urls.tools import get_collection_stats from datalad_registry.models import RepoUrl, db from datalad_registry.search import parse_query @@ -63,9 +64,13 @@ def overview(): # No type hints due to mypy#7187. 
# Paginate pagination = db.paginate(select_stmt) + # Gather stats of the returned collection of datasets + stats = get_collection_stats(base_select_stmt) + return render_template( "overview.html", pagination=pagination, + stats=stats, sort_by=sort_by, search_query=query, search_error=search_error, From 7581a3eb097fcedfb9888d50b5bd9a14f2ff1a62 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 11:49:30 -0700 Subject: [PATCH 34/57] Fix importing of `OperationMode` The current import directly from `datalad_registry` causes circulating import error --- datalad_registry/blueprints/api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_registry/blueprints/api/utils.py b/datalad_registry/blueprints/api/utils.py index f8265727..41422e1d 100644 --- a/datalad_registry/blueprints/api/utils.py +++ b/datalad_registry/blueprints/api/utils.py @@ -3,7 +3,7 @@ from flask import current_app, request -from datalad_registry import OperationMode +from datalad_registry.conf import OperationMode from datalad_registry.utils.flask_tools import json_resp_from_str from . import HTTPExceptionResp From 8d160a3f7877b85bef16b26af5756342b57ab4b2 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 13:22:21 -0700 Subject: [PATCH 35/57] Provide stats at the end of web UI --- datalad_registry/templates/overview.html | 68 ++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index d2287ce8..dea5961e 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -178,6 +178,74 @@

Search query syntax

{{ render_pagination_widget(pagination, '.overview') }}
+ + +
+

Stats

+
    +
  • +

    Datalad Datasets Stats

    +
      +
    • +

      Unique Datalad Dataset Stats

      +
        +
      • Count: {{ stats.datalad_ds_stats.unique_ds_stats.ds_count }}
      • + {% if stats.datalad_ds_stats.unique_ds_stats.annexed_file_count is not none %} +
      • Annexed file + count: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_file_count }}
      • + {% endif %} + {% if stats.datalad_ds_stats.unique_ds_stats.annexed_files_size is not none %} +
      • Annexed files + size: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_files_size }}
      • + {% endif %} +
      +
    • +
    • +

      Stats without deduplication

      +
        +
      • Count: {{ stats.datalad_ds_stats.stats.ds_count }}
      • + {% if stats.datalad_ds_stats.stats.annexed_file_count is not none %} +
      • Annexed file + count: {{ stats.datalad_ds_stats.stats.annexed_file_count }}
      • + {% endif %} + {% if stats.datalad_ds_stats.stats.annexed_files_size is not none %} +
      • Annexed files + size: {{ stats.datalad_ds_stats.stats.annexed_files_size }}
      • + {% endif %} +
      +
    • +
    +
  • +
  • +

    Pure Annex Dataset Stats

    +
      +
    • Count: {{ stats.pure_annex_ds_stats.ds_count }}
    • + {% if stats.pure_annex_ds_stats.annexed_file_count is not none %} +
    • Annexed file + count: {{ stats.pure_annex_ds_stats.annexed_file_count }}
    • + {% endif %} + {% if stats.pure_annex_ds_stats.annexed_files_size is not none %} +
    • Annexed files + size: {{ stats.pure_annex_ds_stats.annexed_files_size }}
    • + {% endif %} +
    +
  • +
  • +

    Non-Annex Dataset Stats

    +
      +
    • Count: {{ stats.non_annex_ds_stats.ds_count }}
    • +
    +
  • +
  • +

    Summary

    +
      +
    • Unique dataset count: {{ stats.summary.unique_ds_count }}
    • +
    • Total dataset count (without + deduplication): {{ stats.summary.ds_count }}
    • +
    +
  • +
+
From 69490edd64020c7c5be948e442906875e9550b3f Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 13:41:53 -0700 Subject: [PATCH 36/57] RF: Use a macro to render stats for annex dataset collection --- datalad_registry/templates/overview.html | 50 ++++++++---------------- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index dea5961e..685ffb10 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -27,6 +27,20 @@ {% endmacro %} +{% macro render_annex_ds_collection_stats(annex_ds_col_stats) %} +
    +
  • Count: {{ annex_ds_col_stats.ds_count }}
  • + {% if annex_ds_col_stats.annexed_file_count is not none %} +
  • Annexed file + count: {{ annex_ds_col_stats.annexed_file_count }}
  • + {% endif %} + {% if annex_ds_col_stats.annexed_files_size is not none %} +
  • Annexed files + size: {{ annex_ds_col_stats.annexed_files_size }}
  • + {% endif %} +
+{% endmacro %} + @@ -188,47 +202,17 @@

Datalad Datasets Stats

  • Unique Datalad Dataset Stats

    -
      -
    • Count: {{ stats.datalad_ds_stats.unique_ds_stats.ds_count }}
    • - {% if stats.datalad_ds_stats.unique_ds_stats.annexed_file_count is not none %} -
    • Annexed file - count: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_file_count }}
    • - {% endif %} - {% if stats.datalad_ds_stats.unique_ds_stats.annexed_files_size is not none %} -
    • Annexed files - size: {{ stats.datalad_ds_stats.unique_ds_stats.annexed_files_size }}
    • - {% endif %} -
    + {{ render_annex_ds_collection_stats(stats.datalad_ds_stats.unique_ds_stats) }}
  • Stats without deduplication

    -
      -
    • Count: {{ stats.datalad_ds_stats.stats.ds_count }}
    • - {% if stats.datalad_ds_stats.stats.annexed_file_count is not none %} -
    • Annexed file - count: {{ stats.datalad_ds_stats.stats.annexed_file_count }}
    • - {% endif %} - {% if stats.datalad_ds_stats.stats.annexed_files_size is not none %} -
    • Annexed files - size: {{ stats.datalad_ds_stats.stats.annexed_files_size }}
    • - {% endif %} -
    + {{ render_annex_ds_collection_stats(stats.datalad_ds_stats.stats) }}
  • Pure Annex Dataset Stats

    -
      -
    • Count: {{ stats.pure_annex_ds_stats.ds_count }}
    • - {% if stats.pure_annex_ds_stats.annexed_file_count is not none %} -
    • Annexed file - count: {{ stats.pure_annex_ds_stats.annexed_file_count }}
    • - {% endif %} - {% if stats.pure_annex_ds_stats.annexed_files_size is not none %} -
    • Annexed files - size: {{ stats.pure_annex_ds_stats.annexed_files_size }}
    • - {% endif %} -
    + {{ render_annex_ds_collection_stats(stats.pure_annex_ds_stats) }}
  • Non-Annex Dataset Stats

    From 4ea58920593dcc96b9d3391bbd94cb654071bd09 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 14:13:47 -0700 Subject: [PATCH 37/57] Add stats trigger button --- datalad_registry/templates/overview.html | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index 685ffb10..8ac368b6 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -193,6 +193,9 @@

    Search query syntax

    {{ render_pagination_widget(pagination, '.overview') }} + + +

    Stats

    From eb97bafe18a99e0c208cd42efd8d18625dd2e8d4 Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 14:34:18 -0700 Subject: [PATCH 38/57] Enclose stats in a `modal` element --- datalad_registry/templates/overview.html | 66 +++++++++++++----------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index 8ac368b6..a01c988c 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -197,41 +197,49 @@

    Search query syntax

    -
    -

    Stats

    -
      -
    • -

      Datalad Datasets Stats

      +
    • -
    • -

      Pure Annex Dataset Stats

      - {{ render_annex_ds_collection_stats(stats.pure_annex_ds_stats) }} -
    • -
    • -

      Non-Annex Dataset Stats

      -
        -
      • Count: {{ stats.non_annex_ds_stats.ds_count }}
      • -
      -
    • -
    • -

      Summary

      -
        -
      • Unique dataset count: {{ stats.summary.unique_ds_count }}
      • -
      • Total dataset count (without - deduplication): {{ stats.summary.ds_count }}
      • -
      -
    • -
    +
    +
    From 1711619a6fc3b2a0b8a1fa522d4186fc3f8da17c Mon Sep 17 00:00:00 2001 From: Isaac To Date: Fri, 19 Apr 2024 17:00:00 -0700 Subject: [PATCH 39/57] Rename html elements and improve comment So new similar elements can be distinguished --- datalad_registry/templates/overview.html | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html index a01c988c..5b3ec8f7 100644 --- a/datalad_registry/templates/overview.html +++ b/datalad_registry/templates/overview.html @@ -245,29 +245,29 @@

    Summary