From 8b7a4970d316a084e13d0dcac83d4f37c9d35278 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:45:53 -0400
Subject: [PATCH 1/7] typing and comments

---
 concatenator/attribute_handling.py         | 2 +-
 concatenator/dataset_and_group_handling.py | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/concatenator/attribute_handling.py b/concatenator/attribute_handling.py
index 83425cd..e0b7177 100644
--- a/concatenator/attribute_handling.py
+++ b/concatenator/attribute_handling.py
@@ -54,7 +54,7 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
 def flatten_coordinate_attribute_paths(
     dataset: netCDF4.Dataset, var: netCDF4.Variable, variable_name: str
 ) -> None:
-    """Flatten the paths of variables referenced in the coordinates attribute."""
+    """Flatten the paths of variables referenced in the 'coordinates' attribute."""
     if "coordinates" in var.ncattrs():
         coord_att = var.getncattr("coordinates")
 
diff --git a/concatenator/dataset_and_group_handling.py b/concatenator/dataset_and_group_handling.py
index 2895f1e..8e44a2a 100644
--- a/concatenator/dataset_and_group_handling.py
+++ b/concatenator/dataset_and_group_handling.py
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 import re
+from logging import Logger
 
 import netCDF4 as nc
 import numpy as np
@@ -273,7 +274,7 @@ def _get_nested_group(dataset: nc.Dataset, group_path: str) -> nc.Group:
     return nested_group
 
 
-def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize=4000) -> tuple:
+def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize: int = 4000) -> tuple:
     """
     For the given dataset, calculate if the size on any dimension is worth
     chunking. Any dimension larger than 4000 will be chunked. This
@@ -324,8 +325,8 @@ def _get_dimension_size(dataset: nc.Dataset, dim_name: str) -> int:
     return dim_size
 
 
-def validate_workable_files(files, logger) -> tuple[list[str], int]:
-    """Remove files from list that are not open-able as netCDF or that are empty."""
+def validate_workable_files(files: list[str], logger: Logger) -> tuple[list[str], int]:
+    """Remove files from a list that are not open-able as netCDF or that are empty."""
     workable_files = []
     for file in files:
         try:
@@ -336,7 +337,7 @@ def validate_workable_files(files, logger) -> tuple[list[str], int]:
         except OSError:
             logger.debug("Error opening <%s> as a netCDF dataset. Skipping.", file)
 
-    # addressing the issue 153: propagate first empty file if all input files are empty
+    # addressing GitHub issue 153: propagate the first empty file if all input files are empty
     if (len(workable_files)) == 0 and (len(files) > 0):
         workable_files.append(files[0])
 

From 45a4075deac682829dd0fad663c3042174089816 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:46:27 -0400
Subject: [PATCH 2/7] move function up to conftest.py

---
 tests/conftest.py               | 4 ++++
 tests/unit/test_run_stitchee.py | 4 +---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 1a0a768..b917654 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -19,6 +19,10 @@ class TempDirs(typing.NamedTuple):
     toy_data_path: Path
 
 
+def path_str(dir_path: Path, filename: str) -> str:
+    return str(dir_path.joinpath(filename))
+
+
 def pytest_addoption(parser):
     """Sets up optional argument to keep temporary testing directory."""
     parser.addoption(
diff --git a/tests/unit/test_run_stitchee.py b/tests/unit/test_run_stitchee.py
index b5377c2..11e58d1 100644
--- a/tests/unit/test_run_stitchee.py
+++ b/tests/unit/test_run_stitchee.py
@@ -7,9 +7,7 @@
 import concatenator
 from concatenator.run_stitchee import parse_args
 
-
-def path_str(dir_path: Path, filename: str) -> str:
-    return str(dir_path.joinpath(filename))
+from ..conftest import path_str
 
 
 def test_parser():

From 57985a3c991073ca2500a94d93b5fb0b36f55e43 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:47:46 -0400
Subject: [PATCH 3/7] make str group flattening into a public function that can handle zero whitespace and add tests

---
 concatenator/attribute_handling.py    | 22 +++++++++++++---------
 tests/unit/test_attribute_handling.py | 14 +++++++++++---
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/concatenator/attribute_handling.py b/concatenator/attribute_handling.py
index e0b7177..3ec3e46 100644
--- a/concatenator/attribute_handling.py
+++ b/concatenator/attribute_handling.py
@@ -26,7 +26,7 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
     Examples
     --------
     >>> coord_att = "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
-    >>> _flatten_coordinate_attribute(coord_att)
+    >>> flatten_string_with_groups(coord_att)
         Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude
 
     Parameters
@@ -58,22 +58,24 @@ def flatten_coordinate_attribute_paths(
     if "coordinates" in var.ncattrs():
         coord_att = var.getncattr("coordinates")
 
-        new_coord_att = _flatten_coordinate_attribute(coord_att)
+        new_coord_att = flatten_string_with_groups(coord_att)
         dataset.variables[variable_name].setncattr("coordinates", new_coord_att)
 
 
-def _flatten_coordinate_attribute(attribute_string: str) -> str:
-    """Converts attributes that specify group membership via "/" to use new group delimiter, even for the root level.
+def flatten_string_with_groups(str_with_groups: str) -> str:
+    """Determine separator and flatten string specifying group membership via "/".
+
+    Applies to variable paths or attributes, even for the root level.
 
     Examples
     --------
     >>> coord_att = "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
-    >>> _flatten_coordinate_attribute(coord_att)
+    >>> flatten_string_with_groups(coord_att)
         __Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude
 
     Parameters
     ----------
-    attribute_string : str
+    str_with_groups : str
 
     Returns
     -------
@@ -81,8 +83,10 @@ def _flatten_coordinate_attribute(attribute_string: str) -> str:
     """
     # Use the separator that's in the attribute string only if all separators in the string are the same.
     # Otherwise, we will use our own default separator.
-    whitespaces = re.findall(r"\s+", attribute_string)
-    if len(set(whitespaces)) <= 1:
+    whitespaces = re.findall(r"\s+", str_with_groups)
+    if len(set(whitespaces)) == 0:
+        new_sep = ""
+    elif len(set(whitespaces)) == 1:
         new_sep = whitespaces[0]
     else:
         new_sep = concatenator.coord_delim
@@ -90,5 +94,5 @@ def _flatten_coordinate_attribute(attribute_string: str) -> str:
     # A new string is constructed.
     return new_sep.join(
         f'{concatenator.group_delim}{c.replace("/", concatenator.group_delim)}'
-        for c in attribute_string.split()  # split on any whitespace
+        for c in str_with_groups.split()  # split on any whitespace
     )
diff --git a/tests/unit/test_attribute_handling.py b/tests/unit/test_attribute_handling.py
index dd2d010..817682a 100644
--- a/tests/unit/test_attribute_handling.py
+++ b/tests/unit/test_attribute_handling.py
@@ -2,7 +2,7 @@
 
 import concatenator
 from concatenator.attribute_handling import (
-    _flatten_coordinate_attribute,
+    flatten_string_with_groups,
     regroup_coordinate_attribute,
 )
 
@@ -10,7 +10,7 @@
 def test_coordinate_attribute_flattening():
     # Case with groups present and double spaces.
     assert (
-        _flatten_coordinate_attribute(
+        flatten_string_with_groups(
             "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
         )
         == "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
@@ -18,13 +18,21 @@ def test_coordinate_attribute_flattening():
 
     # Case with NO groups present and single spaces.
     assert (
-        _flatten_coordinate_attribute(
+        flatten_string_with_groups(
             "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
         )
         == "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
     )
 
 
+def test_variable_path_flattening():
+    # Case with group present.
+    assert flatten_string_with_groups("geolocation/time") == "__geolocation__time"
+
+    # Case with NO groups present.
+    assert flatten_string_with_groups("time") == "__time"
+
+
 def test_coordinate_attribute_regrouping():
     # Case with groups present and double spaces.
     assert (

From f2d3cce68c078caba96defdff4984d744ddf624f Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:48:12 -0400
Subject: [PATCH 4/7] remove duplicate tests and add test for file validation

---
 tests/unit/test_group_handling.py | 72 ++++++++++++++-----------
 1 file changed, 32 insertions(+), 40 deletions(-)

diff --git a/tests/unit/test_group_handling.py b/tests/unit/test_group_handling.py
index 89a51a1..53cb094 100644
--- a/tests/unit/test_group_handling.py
+++ b/tests/unit/test_group_handling.py
@@ -1,44 +1,36 @@
 """Tests for manipulating netCDF groups."""
 
 # pylint: disable=C0116, C0301
+import logging
+from pathlib import Path
 
-from concatenator.attribute_handling import (
-    _flatten_coordinate_attribute,
-    regroup_coordinate_attribute,
-)
-
-
-def test_coordinate_attribute_flattening():
-    # Case with groups present and double spaces.
-    assert (
-        _flatten_coordinate_attribute(
-            "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
-        )
-        == "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
-    )
-
-    # Case with NO groups present and single spaces.
-    assert (
-        _flatten_coordinate_attribute(
-            "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
-        )
-        == "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
-    )
-
-
-def test_coordinate_attribute_regrouping():
-    # Case with groups present and double spaces.
-    assert (
-        regroup_coordinate_attribute(
-            "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
-        )
-        == "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
-    )
-
-    # Case with NO groups present and single spaces.
-    assert (
-        regroup_coordinate_attribute(
-            "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
-        )
-        == "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
-    )
+import pytest
+
+from concatenator.dataset_and_group_handling import validate_workable_files
+
+from ..conftest import path_str
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.usefixtures("pass_options")
+class TestGroupHandling:
+    __test_path = Path(__file__).parents[1].resolve()
+    __data_path = __test_path.joinpath("data")
+    __harmony_path = __data_path.joinpath("harmony")
+    __granules_path = __harmony_path.joinpath("granules")
+
+    def test_workable_files_validation(self, temp_output_dir):
+        filepaths = [
+            path_str(
+                self.__granules_path, "TEMPO_NO2_L2_V03_20240601T210934Z_S012G01_subsetted.nc4"
+            ),
+            path_str(
+                self.__granules_path, "TEMPO_NO2_L2_V03_20240601T211614Z_S012G02_subsetted.nc4"
+            ),
+            path_str(
+                self.__granules_path, "TEMPO_NO2_L2_V03_20240601T212254Z_S012G03_subsetted.nc4"
+            ),
+        ]
+
+        assert validate_workable_files(filepaths, logger)[1] == 3

From 9f2fefb4c1a3458ba542024b212c26f41ace3af8 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 16:49:39 -0400
Subject: [PATCH 5/7] add new 'time_variable' to stitchee function signature for ordering datasets

---
 concatenator/stitchee.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/concatenator/stitchee.py b/concatenator/stitchee.py
index fb71617..d20c822 100644
--- a/concatenator/stitchee.py
+++ b/concatenator/stitchee.py
@@ -15,6 +15,7 @@
 import xarray as xr
 
 import concatenator
+from concatenator.attribute_handling import flatten_string_with_groups
 from concatenator.dataset_and_group_handling import (
     flatten_grouped_dataset,
     regroup_flattened_dataset,
@@ -39,6 +40,7 @@ def stitchee(
     concat_method: str | None = "xarray-concat",
     concat_dim: str = "",
     concat_kwargs: dict | None = None,
+    time_variable: str = "geolocation/time",
     history_to_append: str | None = None,
     copy_input_files: bool = False,
     overwrite_output_file: bool = False,
@@ -136,7 +138,10 @@ def stitchee(
                 decode_coords=False,
                 drop_variables=coord_vars,
             )
-            first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]
+
+            # Determine value for later dataset sorting.
+            first_value = xrds[flatten_string_with_groups(time_variable)].values.flatten()[0]
+            # first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]
             concat_dim_order.append(first_value)
 
             benchmark_log["flattening"] = time.time() - start_time

From 79a19796601252fedab8045e2714e1a5dc30a927 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 17:15:00 -0400
Subject: [PATCH 6/7] update history construction test to use new time_variable arg

---
 tests/integration/test_history_construction.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_history_construction.py b/tests/integration/test_history_construction.py
index 064f033..034e6de 100644
--- a/tests/integration/test_history_construction.py
+++ b/tests/integration/test_history_construction.py
@@ -28,6 +28,7 @@ def test_construct_and_append_history_for_sample_concatenation(
         concat_method="xarray-concat",
         history_to_append=new_history_json,
         concat_dim="step",
+        time_variable="step",
     )
 
     stitcheed_dataset = xr.open_dataset(output_path)

From 391e4e024a2c7f60af81d85b7618c6b66a7dc519 Mon Sep 17 00:00:00 2001
From: danielfromearth
Date: Tue, 2 Jul 2024 17:19:54 -0400
Subject: [PATCH 7/7] update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ae4a52..316f78d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 - [Issue #206](https://github.com/nasa/stitchee/issues/206): Group dependabot updates into one PR
 - [issue #208](https://github.com/nasa/stitchee/issues/208): Increase continuous integration/unit test coverage
+- [issue #198](https://github.com/nasa/stitchee/issues/198): Use time variable instead of concat dim for ordering datasets
 ### Deprecated
 ### Removed
 ### Fixed
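
For reference, a brief usage sketch of the behavior this series introduces. The assertions mirror the unit tests added in PATCH 3/7; the file paths, output path, and concat dimension passed to stitchee() below are illustrative assumptions for the example, not values taken from the diffs.

from concatenator.attribute_handling import flatten_string_with_groups
from concatenator.stitchee import stitchee

# Group-qualified variable paths are flattened with the "__" group delimiter
# exercised by the new unit tests; a path with no whitespace is now handled too.
assert flatten_string_with_groups("geolocation/time") == "__geolocation__time"
assert flatten_string_with_groups("time") == "__time"

# The new 'time_variable' argument tells stitchee which variable's first value
# to use when ordering the input datasets before concatenation.
stitchee(
    ["granule_a.nc4", "granule_b.nc4"],  # placeholder input paths
    "stitched_output.nc4",               # placeholder output path
    concat_method="xarray-concat",
    concat_dim="mirror_step",            # placeholder dimension name
    time_variable="geolocation/time",
)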