From 8b7a4970d316a084e13d0dcac83d4f37c9d35278 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:45:53 -0400
Subject: [PATCH 1/7] typing and comments

---
 concatenator/attribute_handling.py         | 2 +-
 concatenator/dataset_and_group_handling.py | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/concatenator/attribute_handling.py b/concatenator/attribute_handling.py
index 83425cd..e0b7177 100644
--- a/concatenator/attribute_handling.py
+++ b/concatenator/attribute_handling.py
@@ -54,7 +54,7 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
 def flatten_coordinate_attribute_paths(
     dataset: netCDF4.Dataset, var: netCDF4.Variable, variable_name: str
 ) -> None:
-    """Flatten the paths of variables referenced in the coordinates attribute."""
+    """Flatten the paths of variables referenced in the 'coordinates' attribute."""
     if "coordinates" in var.ncattrs():
         coord_att = var.getncattr("coordinates")
 
diff --git a/concatenator/dataset_and_group_handling.py b/concatenator/dataset_and_group_handling.py
index 2895f1e..8e44a2a 100644
--- a/concatenator/dataset_and_group_handling.py
+++ b/concatenator/dataset_and_group_handling.py
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 import re
+from logging import Logger
 
 import netCDF4 as nc
 import numpy as np
@@ -273,7 +274,7 @@ def _get_nested_group(dataset: nc.Dataset, group_path: str) -> nc.Group:
     return nested_group
 
 
-def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize=4000) -> tuple:
+def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize: int = 4000) -> tuple:
     """
     For the given dataset, calculate if the size on any dimension is worth
     chunking. Any dimension larger than 4000 will be chunked. This
@@ -324,8 +325,8 @@ def _get_dimension_size(dataset: nc.Dataset, dim_name: str) -> int:
     return dim_size
 
 
-def validate_workable_files(files, logger) -> tuple[list[str], int]:
-    """Remove files from list that are not open-able as netCDF or that are empty."""
+def validate_workable_files(files: list[str], logger: Logger) -> tuple[list[str], int]:
+    """Remove files from a list that are not open-able as netCDF or that are empty."""
     workable_files = []
     for file in files:
         try:
@@ -336,7 +337,7 @@ def validate_workable_files(files, logger) -> tuple[list[str], int]:
         except OSError:
             logger.debug("Error opening <%s> as a netCDF dataset. Skipping.", file)
 
-    # addressing the issue 153: propagate first empty file if all input files are empty
+    # addressing GitHub issue 153: propagate the first empty file if all input files are empty
     if (len(workable_files)) == 0 and (len(files) > 0):
         workable_files.append(files[0])
 

From 45a4075deac682829dd0fad663c3042174089816 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:46:27 -0400
Subject: [PATCH 2/7] move function up to conftest.py

---
 tests/conftest.py               | 4 ++++
 tests/unit/test_run_stitchee.py | 4 +---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 1a0a768..b917654 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -19,6 +19,10 @@ class TempDirs(typing.NamedTuple):
     toy_data_path: Path
 
 
+def path_str(dir_path: Path, filename: str) -> str:
+    return str(dir_path.joinpath(filename))
+
+
 def pytest_addoption(parser):
     """Sets up optional argument to keep temporary testing directory."""
     parser.addoption(
diff --git a/tests/unit/test_run_stitchee.py b/tests/unit/test_run_stitchee.py
index b5377c2..11e58d1 100644
--- a/tests/unit/test_run_stitchee.py
+++ b/tests/unit/test_run_stitchee.py
@@ -7,9 +7,7 @@
 import concatenator
 from concatenator.run_stitchee import parse_args
 
-
-def path_str(dir_path: Path, filename: str) -> str:
-    return str(dir_path.joinpath(filename))
+from ..conftest import path_str
 
 
 def test_parser():

From 57985a3c991073ca2500a94d93b5fb0b36f55e43 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:47:46 -0400
Subject: [PATCH 3/7] make str group flattening into a public function that can handle zero whitespace and add tests

---
 concatenator/attribute_handling.py    | 22 +++++++++++++---------
 tests/unit/test_attribute_handling.py | 14 +++++++++++---
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/concatenator/attribute_handling.py b/concatenator/attribute_handling.py
index e0b7177..3ec3e46 100644
--- a/concatenator/attribute_handling.py
+++ b/concatenator/attribute_handling.py
@@ -26,7 +26,7 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
     Examples
     --------
     >>> coord_att = "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
-    >>> _flatten_coordinate_attribute(coord_att)
+    >>> flatten_string_with_groups(coord_att)
         Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude
 
     Parameters
@@ -58,22 +58,24 @@ def flatten_coordinate_attribute_paths(
     if "coordinates" in var.ncattrs():
         coord_att = var.getncattr("coordinates")
 
-        new_coord_att = _flatten_coordinate_attribute(coord_att)
+        new_coord_att = flatten_string_with_groups(coord_att)
         dataset.variables[variable_name].setncattr("coordinates", new_coord_att)
 
 
-def _flatten_coordinate_attribute(attribute_string: str) -> str:
-    """Converts attributes that specify group membership via "/" to use new group delimiter, even for the root level.
+def flatten_string_with_groups(str_with_groups: str) -> str:
+    """Determine separator and flatten string specifying group membership via "/".
+
+    Applies to variable paths or attributes, even for the root level.
 
     Examples
     --------
     >>> coord_att = "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
-    >>> _flatten_coordinate_attribute(coord_att)
+    >>> flatten_string_with_groups(coord_att)
         __Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude
 
     Parameters
     ----------
-    attribute_string : str
+    str_with_groups : str
 
     Returns
     -------
@@ -81,8 +83,10 @@ def _flatten_coordinate_attribute(attribute_string: str) -> str:
     """
     # Use the separator that's in the attribute string only if all separators in the string are the same.
     # Otherwise, we will use our own default separator.
-    whitespaces = re.findall(r"\s+", attribute_string)
-    if len(set(whitespaces)) <= 1:
+    whitespaces = re.findall(r"\s+", str_with_groups)
+    if len(set(whitespaces)) == 0:
+        new_sep = ""
+    elif len(set(whitespaces)) == 1:
         new_sep = whitespaces[0]
     else:
         new_sep = concatenator.coord_delim
@@ -90,5 +94,5 @@ def _flatten_coordinate_attribute(attribute_string: str) -> str:
     # A new string is constructed.
     return new_sep.join(
         f'{concatenator.group_delim}{c.replace("/", concatenator.group_delim)}'
-        for c in attribute_string.split()  # split on any whitespace
+        for c in str_with_groups.split()  # split on any whitespace
     )
diff --git a/tests/unit/test_attribute_handling.py b/tests/unit/test_attribute_handling.py
index dd2d010..817682a 100644
--- a/tests/unit/test_attribute_handling.py
+++ b/tests/unit/test_attribute_handling.py
@@ -2,7 +2,7 @@
 
 import concatenator
 from concatenator.attribute_handling import (
-    _flatten_coordinate_attribute,
+    flatten_string_with_groups,
     regroup_coordinate_attribute,
 )
 
@@ -10,7 +10,7 @@
 def test_coordinate_attribute_flattening():
     # Case with groups present and double spaces.
     assert (
-        _flatten_coordinate_attribute(
+        flatten_string_with_groups(
             "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
         )
         == "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
@@ -18,13 +18,21 @@ def test_coordinate_attribute_flattening():
 
     # Case with NO groups present and single spaces.
     assert (
-        _flatten_coordinate_attribute(
+        flatten_string_with_groups(
             "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
         )
         == "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
     )
 
 
+def test_variable_path_flattening():
+    # Case with group present.
+    assert flatten_string_with_groups("geolocation/time") == "__geolocation__time"
+
+    # Case with NO groups present.
+    assert flatten_string_with_groups("time") == "__time"
+
+
 def test_coordinate_attribute_regrouping():
     # Case with groups present and double spaces.
     assert (

From f2d3cce68c078caba96defdff4984d744ddf624f Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:48:12 -0400
Subject: [PATCH 4/7] remove duplicate tests and add test for file validation

---
 tests/unit/test_group_handling.py | 72 ++++++++++++++-----------
 1 file changed, 32 insertions(+), 40 deletions(-)

diff --git a/tests/unit/test_group_handling.py b/tests/unit/test_group_handling.py
index 89a51a1..53cb094 100644
--- a/tests/unit/test_group_handling.py
+++ b/tests/unit/test_group_handling.py
@@ -1,44 +1,36 @@
 """Tests for manipulating netCDF groups."""
 
 # pylint: disable=C0116, C0301
+import logging
+from pathlib import Path
 
-from concatenator.attribute_handling import (
-    _flatten_coordinate_attribute,
-    regroup_coordinate_attribute,
-)
-
-
-def test_coordinate_attribute_flattening():
-    # Case with groups present and double spaces.
-    assert (
-        _flatten_coordinate_attribute(
-            "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
-        )
-        == "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
-    )
-
-    # Case with NO groups present and single spaces.
-    assert (
-        _flatten_coordinate_attribute(
-            "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
-        )
-        == "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
-    )
-
-
-def test_coordinate_attribute_regrouping():
-    # Case with groups present and double spaces.
-    assert (
-        regroup_coordinate_attribute(
-            "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
-        )
-        == "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
-    )
-
-    # Case with NO groups present and single spaces.
-    assert (
-        regroup_coordinate_attribute(
-            "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
-        )
-        == "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
-    )
+import pytest
+
+from concatenator.dataset_and_group_handling import validate_workable_files
+
+from ..conftest import path_str
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.usefixtures("pass_options")
+class TestGroupHandling:
+    __test_path = Path(__file__).parents[1].resolve()
+    __data_path = __test_path.joinpath("data")
+    __harmony_path = __data_path.joinpath("harmony")
+    __granules_path = __harmony_path.joinpath("granules")
+
+    def test_workable_files_validation(self, temp_output_dir):
+        filepaths = [
+            path_str(
+                self.__granules_path, "TEMPO_NO2_L2_V03_20240601T210934Z_S012G01_subsetted.nc4"
+            ),
+            path_str(
+                self.__granules_path, "TEMPO_NO2_L2_V03_20240601T211614Z_S012G02_subsetted.nc4"
+            ),
+            path_str(
+                self.__granules_path, "TEMPO_NO2_L2_V03_20240601T212254Z_S012G03_subsetted.nc4"
+            ),
+        ]
+
+        assert validate_workable_files(filepaths, logger)[1] == 3

From 9f2fefb4c1a3458ba542024b212c26f41ace3af8 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:49:39 -0400
Subject: [PATCH 5/7] add new 'time_variable' to stitchee function signature for ordering datasets

---
 concatenator/stitchee.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/concatenator/stitchee.py b/concatenator/stitchee.py
index fb71617..d20c822 100644
--- a/concatenator/stitchee.py
+++ b/concatenator/stitchee.py
@@ -15,6 +15,7 @@
 import xarray as xr
 
 import concatenator
+from concatenator.attribute_handling import flatten_string_with_groups
 from concatenator.dataset_and_group_handling import (
     flatten_grouped_dataset,
     regroup_flattened_dataset,
@@ -39,6 +40,7 @@ def stitchee(
     concat_method: str | None = "xarray-concat",
     concat_dim: str = "",
     concat_kwargs: dict | None = None,
+    time_variable: str = "geolocation/time",
     history_to_append: str | None = None,
     copy_input_files: bool = False,
     overwrite_output_file: bool = False,
@@ -136,7 +138,10 @@ def stitchee(
                 decode_coords=False,
                 drop_variables=coord_vars,
             )
-            first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]
+
+            # Determine value for later dataset sorting.
+            first_value = xrds[flatten_string_with_groups(time_variable)].values.flatten()[0]
+            # first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]
             concat_dim_order.append(first_value)
 
             benchmark_log["flattening"] = time.time() - start_time

From 79a19796601252fedab8045e2714e1a5dc30a927 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 17:15:00 -0400
Subject: [PATCH 6/7] update history construction test to use new time_variable arg

---
 tests/integration/test_history_construction.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_history_construction.py b/tests/integration/test_history_construction.py
index 064f033..034e6de 100644
--- a/tests/integration/test_history_construction.py
+++ b/tests/integration/test_history_construction.py
@@ -28,6 +28,7 @@ def test_construct_and_append_history_for_sample_concatenation(
         concat_method="xarray-concat",
         history_to_append=new_history_json,
         concat_dim="step",
+        time_variable="step",
     )
 
     stitcheed_dataset = xr.open_dataset(output_path)

From 391e4e024a2c7f60af81d85b7618c6b66a7dc519 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 17:19:54 -0400
Subject: [PATCH 7/7] update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ae4a52..316f78d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 - [Issue #206](https://github.com/nasa/stitchee/issues/206): Group dependabot updates into one PR
 - [issue #208](https://github.com/nasa/stitchee/issues/208): Increase continuous integration/unit test coverage
+- [issue #198](https://github.com/nasa/stitchee/issues/198): Use time variable instead of concat dim for ordering datasets
 ### Deprecated
 ### Removed
 ### Fixed
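
For reference, a brief usage sketch of the behavior this series introduces. The assertions mirror the unit tests added in PATCH 3/7; the file paths, output path, and concat dimension passed to stitchee() below are illustrative assumptions for the example, not values taken from the diffs.

from concatenator.attribute_handling import flatten_string_with_groups
from concatenator.stitchee import stitchee

# Group-qualified variable paths are flattened with the "__" group delimiter
# exercised by the new unit tests; a path with no whitespace is now handled too.
assert flatten_string_with_groups("geolocation/time") == "__geolocation__time"
assert flatten_string_with_groups("time") == "__time"

# The new 'time_variable' argument tells stitchee which variable's first value
# to use when ordering the input datasets before concatenation.
stitchee(
    ["granule_a.nc4", "granule_b.nc4"],  # placeholder input paths
    "stitched_output.nc4",               # placeholder output path
    concat_method="xarray-concat",
    concat_dim="mirror_step",            # placeholder dimension name
    time_variable="geolocation/time",
)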