merge develop into issue-169 branch
danielfromearth committed Jul 10, 2024
2 parents 10c086a + d087507 commit 01b73ea
Showing 34 changed files with 1,251 additions and 556 deletions.
8 changes: 8 additions & 0 deletions .github/dependabot.yml
@@ -4,11 +4,19 @@ updates:
directory: "/"
schedule:
interval: "monthly"
groups:
pip-dependencies:
patterns:
- "*"
# Raise pull requests for version updates
# to pip against the `develop` branch
target-branch: "develop"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
groups:
gha-dependencies:
patterns:
- "*"
target-branch: "develop"
2 changes: 1 addition & 1 deletion .github/workflows/build-pipeline.yml
@@ -170,7 +170,7 @@ jobs:
- name: Build and push Docker image
if: ${{ !startsWith(github.ref, 'refs/heads/main/') }}
id: docker-push
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
context: .
file: Dockerfile
2 changes: 1 addition & 1 deletion .github/workflows/run_tests.yml
@@ -36,7 +36,7 @@ jobs:
poetry-version: ${{ env.POETRY_VERSION }}

- name: Install package
run: poetry install
run: poetry install --with=harmony --without integration

- name: Run linting
run: |
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -1,4 +1,10 @@
---
ci:
autoupdate_schedule: "monthly" # Like dependabot
autoupdate_commit_msg: "chore: update pre-commit hooks"
autoupdate_branch: "develop"
autofix_prs: false # Comment "pre-commit.ci autofix" on a PR to trigger

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -12,7 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [Issue #181](https://github.com/nasa/stitchee/issues/181): Add a group delimiter argument
- [Issue #134](https://github.com/nasa/stitchee/issues/134): Add an integration test that runs stitchee on files first subsetted by the operational Harmony subsetter
- [Issue #194](https://github.com/nasa/stitchee/issues/194): Add page about the SAMBAH service chain to the Readthedocs documentation
- [issue #193](https://github.com/nasa/stitchee/issues/193): Add autoupdate schedule for pre-commit
### Changed
- [Issue #206](https://github.com/nasa/stitchee/issues/206): Group dependabot updates into one PR
- [issue #208](https://github.com/nasa/stitchee/issues/208): Increase continuous integration/unit test coverage
- [issue #198](https://github.com/nasa/stitchee/issues/198): Use time variable instead of concat dim for ordering datasets
### Deprecated
### Removed
### Fixed
15 changes: 0 additions & 15 deletions concatenator/__init__.py
@@ -24,24 +24,9 @@ def __getattr__(name): # type: ignore
"""
global _options

if name == "__options__":
return _options
if name == "group_delim":
return _options.group_delim
if name == "coord_delim":
return _options.coord_delim
else:
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __setattr__(name, value): # type: ignore
"""Module-level setattr to handle setting of `concatenator.options`.
Other unhandled attributes raise as `AttributeError` as expected.
"""
if name == "group_delim":
_options.group_delim = value
elif name == "coord_delim":
_options.coord_delim = value
else:
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
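
For context (this is not part of the commit's diff): the module above relies on PEP 562 module-level attribute hooks. A minimal sketch of that pattern, with assumed `_Options` fields and default delimiter values, looks like this:

```python
# Minimal sketch of the PEP 562 module-level __getattr__ pattern used by
# concatenator/__init__.py. The _Options fields and their defaults here are
# assumptions for illustration, not the package's exact definitions.
from dataclasses import dataclass


@dataclass
class _Options:
    group_delim: str = "__"
    coord_delim: str = "  "


_options = _Options()


def __getattr__(name):  # type: ignore
    """Resolve module attributes such as `group_delim` dynamically from `_options`."""
    if name == "group_delim":
        return _options.group_delim
    if name == "coord_delim":
        return _options.coord_delim
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

With the `__setattr__` hook deleted by this commit, reads such as `concatenator.group_delim` and `concatenator.coord_delim` (used in `attribute_handling.py` below) appear to still resolve through `_options`, while write handling is no longer intercepted at the module level.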
24 changes: 14 additions & 10 deletions concatenator/attribute_handling.py
@@ -26,7 +26,7 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
Examples
--------
>>> coord_att = "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
>>> _flatten_coordinate_attribute(coord_att)
>>> flatten_string_with_groups(coord_att)
Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude
Parameters
@@ -54,44 +54,48 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
def flatten_coordinate_attribute_paths(
dataset: netCDF4.Dataset, var: netCDF4.Variable, variable_name: str
) -> None:
"""Flatten the paths of variables referenced in the coordinates attribute."""
"""Flatten the paths of variables referenced in the 'coordinates' attribute."""
if "coordinates" in var.ncattrs():
coord_att = var.getncattr("coordinates")

new_coord_att = _flatten_coordinate_attribute(coord_att)
new_coord_att = flatten_string_with_groups(coord_att)

dataset.variables[variable_name].setncattr("coordinates", new_coord_att)


def _flatten_coordinate_attribute(attribute_string: str) -> str:
"""Converts attributes that specify group membership via "/" to use new group delimiter, even for the root level.
def flatten_string_with_groups(str_with_groups: str) -> str:
"""Determine separator and flatten string specifying group membership via "/".
Applies to variable paths or attributes, even for the root level.
Examples
--------
>>> coord_att = "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
>>> _flatten_coordinate_attribute(coord_att)
>>> flatten_string_with_groups(coord_att)
__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude
Parameters
----------
attribute_string : str
str_with_groups : str
Returns
-------
str
"""
# Use the separator that's in the attribute string only if all separators in the string are the same.
# Otherwise, we will use our own default separator.
whitespaces = re.findall(r"\s+", attribute_string)
if len(set(whitespaces)) <= 1:
whitespaces = re.findall(r"\s+", str_with_groups)
if len(set(whitespaces)) == 0:
new_sep = ""
elif len(set(whitespaces)) == 1:
new_sep = whitespaces[0]
else:
new_sep = concatenator.coord_delim

# A new string is constructed.
return new_sep.join(
f'{concatenator.group_delim}{c.replace("/", concatenator.group_delim)}'
for c in attribute_string.split() # split on any whitespace
for c in str_with_groups.split() # split on any whitespace
)


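A self-contained sketch of the separator logic this hunk introduces, with delimiters hard-coded as stand-ins for `concatenator.group_delim` and `concatenator.coord_delim`:

```python
import re

GROUP_DELIM = "__"  # assumed stand-in for concatenator.group_delim
COORD_DELIM = "  "  # assumed stand-in for concatenator.coord_delim


def flatten_string_with_groups_sketch(str_with_groups: str) -> str:
    # Reuse the string's own separator only if it is uniform throughout;
    # fall back to the default when separators are mixed or absent.
    whitespaces = re.findall(r"\s+", str_with_groups)
    if len(set(whitespaces)) == 0:
        new_sep = ""  # a single path: nothing to join
    elif len(set(whitespaces)) == 1:
        new_sep = whitespaces[0]
    else:
        new_sep = COORD_DELIM
    return new_sep.join(
        f'{GROUP_DELIM}{p.replace("/", GROUP_DELIM)}' for p in str_with_groups.split()
    )


assert flatten_string_with_groups_sketch("a/b c/d") == "__a__b __c__d"
assert flatten_string_with_groups_sketch("a/b") == "__a__b"
```

The new `len(set(whitespaces)) == 0` branch is what lets a single variable path (no whitespace at all) pass through the same function as a multi-path `coordinates` attribute.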
9 changes: 5 additions & 4 deletions concatenator/dataset_and_group_handling.py
@@ -8,6 +8,7 @@
from __future__ import annotations

import re
from logging import Logger

import netCDF4 as nc
import numpy as np
@@ -273,7 +274,7 @@ def _get_nested_group(dataset: nc.Dataset, group_path: str) -> nc.Group:
return nested_group


def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize=4000) -> tuple:
def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize: int = 4000) -> tuple:
"""
For the given dataset, calculate if the size on any dimension is
worth chunking. Any dimension larger than 4000 will be chunked. This
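
One plausible reading of the (truncated) docstring above, as a hedged sketch only; the function body is not shown in this diff and the actual implementation may differ:

```python
# Hedged sketch: chunk any dimension larger than the threshold at the
# threshold size, and leave smaller dimensions whole.
def _calculate_chunks_sketch(dim_sizes: list, default_low_dim_chunksize: int = 4000) -> tuple:
    return tuple(min(size, default_low_dim_chunksize) for size in dim_sizes)
```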
@@ -324,8 +325,8 @@ def _get_dimension_size(dataset: nc.Dataset, dim_name: str) -> int:
return dim_size


def validate_workable_files(files, logger) -> tuple[list[str], int]:
"""Remove files from list that are not open-able as netCDF or that are empty."""
def validate_workable_files(files: list[str], logger: Logger) -> tuple[list[str], int]:
"""Remove files from a list that are not open-able as netCDF or that are empty."""
workable_files = []
for file in files:
try:
@@ -336,7 +337,7 @@ def validate_workable_files(files, logger) -> tuple[list[str], int]:
except OSError:
logger.debug("Error opening <%s> as a netCDF dataset. Skipping.", file)

# addressing the issue 153: propagate first empty file if all input files are empty
# addressing GitHub issue 153: propagate the first empty file if all input files are empty
if (len(workable_files)) == 0 and (len(files) > 0):
workable_files.append(files[0])

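A hypothetical usage sketch of the newly typed `validate_workable_files` (file paths are illustrative):

```python
import logging

from concatenator.dataset_and_group_handling import validate_workable_files

logger = logging.getLogger("stitchee")
candidate_files = ["granule_1.nc", "granule_2.nc"]  # illustrative paths

workable_files, number_of_workable = validate_workable_files(candidate_files, logger)
# Per the GitHub issue 153 handling above: if every input is empty, the first
# file is kept so an empty result can still be propagated downstream.
```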
1 change: 1 addition & 0 deletions concatenator/harmony/cli.py
@@ -1,4 +1,5 @@
"""A Harmony CLI wrapper around the concatenate-batcher"""

from argparse import ArgumentParser

import harmony
2 changes: 1 addition & 1 deletion concatenator/harmony/download_worker.py
@@ -113,6 +113,6 @@ def _download_worker(
dest_path = path.parent.joinpath(filename)
path = path.rename(dest_path)
else:
logger.warning("Origin filename could not be assertained - %s", url)
logger.warning("Origin filename could not be ascertained - %s", url)

path_list.append(str(path))
10 changes: 7 additions & 3 deletions concatenator/stitchee.py
@@ -15,6 +15,7 @@
import xarray as xr

import concatenator
from concatenator.attribute_handling import flatten_string_with_groups
from concatenator.dataset_and_group_handling import (
flatten_grouped_dataset,
regroup_flattened_dataset,
@@ -39,6 +40,7 @@ def stitchee(
concat_method: str | None = "xarray-concat",
concat_dim: str = "",
concat_kwargs: dict | None = None,
time_variable: str = "geolocation/time",
history_to_append: str | None = None,
copy_input_files: bool = False,
overwrite_output_file: bool = False,
@@ -137,9 +139,11 @@
decode_coords=False,
drop_variables=coord_vars,
) as xrds:
first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[
0
]
# Determine value for later dataset sorting.
first_value = xrds[
flatten_string_with_groups(time_variable)
].values.flatten()[0]
# first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]
concat_dim_order.append(first_value)

benchmark_log["flattening"] = time.time() - start_time
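A hedged sketch of the ordering step this hunk introduces (see CHANGELOG issue #198): record each granule's first time value from the new `time_variable` argument, then sort the inputs by those values before concatenating. File names below are illustrative; `flatten_string_with_groups("geolocation/time")` yields the flattened variable name (e.g. `__geolocation__time` under an assumed `__` group delimiter).

```python
import xarray as xr

from concatenator.attribute_handling import flatten_string_with_groups

time_variable = "geolocation/time"
flattened_time = flatten_string_with_groups(time_variable)

concat_dim_order = []
for path in ["granule_a_flat.nc", "granule_b_flat.nc"]:  # illustrative flattened files
    with xr.open_dataset(path, decode_times=False, decode_coords=False) as xrds:
        # Record the first time value for later dataset sorting.
        concat_dim_order.append(xrds[flattened_time].values.flatten()[0])

# Indices of the input files in ascending first-time order.
sorted_indices = sorted(range(len(concat_dim_order)), key=concat_dim_order.__getitem__)
```

Sorting on the time variable rather than on `concat_dim` means datasets are ordered correctly even when the concatenation dimension itself carries no meaningful ordering.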
