Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/issue 198 ordered output #221

Merged
merged 7 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- [Issue #206](https://github.com/nasa/stitchee/issues/206): Group dependabot updates into one PR
- [issue #208](https://github.com/nasa/stitchee/issues/208): Increase continuous integration/unit test coverage
- [issue #198](https://github.com/nasa/stitchee/issues/198): Use time variable instead of concat dim for ordering datasets
### Deprecated
### Removed
### Fixed
Expand Down
24 changes: 14 additions & 10 deletions concatenator/attribute_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
Examples
--------
>>> coord_att = "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
>>> _flatten_coordinate_attribute(coord_att)
>>> flatten_string_with_groups(coord_att)
Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude

Parameters
Expand Down Expand Up @@ -54,44 +54,48 @@ def regroup_coordinate_attribute(attribute_string: str) -> str:
def flatten_coordinate_attribute_paths(
dataset: netCDF4.Dataset, var: netCDF4.Variable, variable_name: str
) -> None:
"""Flatten the paths of variables referenced in the coordinates attribute."""
"""Flatten the paths of variables referenced in the 'coordinates' attribute."""
if "coordinates" in var.ncattrs():
coord_att = var.getncattr("coordinates")

new_coord_att = _flatten_coordinate_attribute(coord_att)
new_coord_att = flatten_string_with_groups(coord_att)

dataset.variables[variable_name].setncattr("coordinates", new_coord_att)


def _flatten_coordinate_attribute(attribute_string: str) -> str:
"""Converts attributes that specify group membership via "/" to use new group delimiter, even for the root level.
def flatten_string_with_groups(str_with_groups: str) -> str:
"""Determine separator and flatten string specifying group membership via "/".

Applies to variable paths or attributes, even for the root level.

Examples
--------
>>> coord_att = "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
>>> _flatten_coordinate_attribute(coord_att)
>>> flatten_string_with_groups(coord_att)
__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude

Parameters
----------
attribute_string : str
str_with_groups : str

Returns
-------
str
"""
# Use the separator that's in the attribute string only if all separators in the string are the same.
# Otherwise, we will use our own default separator.
whitespaces = re.findall(r"\s+", attribute_string)
if len(set(whitespaces)) <= 1:
whitespaces = re.findall(r"\s+", str_with_groups)
if len(set(whitespaces)) == 0:
new_sep = ""
elif len(set(whitespaces)) == 1:
new_sep = whitespaces[0]
else:
new_sep = concatenator.coord_delim

# A new string is constructed.
return new_sep.join(
f'{concatenator.group_delim}{c.replace("/", concatenator.group_delim)}'
for c in attribute_string.split() # split on any whitespace
for c in str_with_groups.split() # split on any whitespace
)


Expand Down
9 changes: 5 additions & 4 deletions concatenator/dataset_and_group_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from __future__ import annotations

import re
from logging import Logger

import netCDF4 as nc
import numpy as np
Expand Down Expand Up @@ -273,7 +274,7 @@ def _get_nested_group(dataset: nc.Dataset, group_path: str) -> nc.Group:
return nested_group


def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize=4000) -> tuple:
def _calculate_chunks(dim_sizes: list, default_low_dim_chunksize: int = 4000) -> tuple:
"""
For the given dataset, calculate if the size on any dimension is
worth chunking. Any dimension larger than 4000 will be chunked. This
Expand Down Expand Up @@ -324,8 +325,8 @@ def _get_dimension_size(dataset: nc.Dataset, dim_name: str) -> int:
return dim_size


def validate_workable_files(files, logger) -> tuple[list[str], int]:
"""Remove files from list that are not open-able as netCDF or that are empty."""
def validate_workable_files(files: list[str], logger: Logger) -> tuple[list[str], int]:
"""Remove files from a list that are not open-able as netCDF or that are empty."""
workable_files = []
for file in files:
try:
Expand All @@ -336,7 +337,7 @@ def validate_workable_files(files, logger) -> tuple[list[str], int]:
except OSError:
logger.debug("Error opening <%s> as a netCDF dataset. Skipping.", file)

# addressing the issue 153: propagate first empty file if all input files are empty
# addressing GitHub issue 153: propagate the first empty file if all input files are empty
if (len(workable_files)) == 0 and (len(files) > 0):
workable_files.append(files[0])

Expand Down
7 changes: 6 additions & 1 deletion concatenator/stitchee.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import xarray as xr

import concatenator
from concatenator.attribute_handling import flatten_string_with_groups
from concatenator.dataset_and_group_handling import (
flatten_grouped_dataset,
regroup_flattened_dataset,
Expand All @@ -39,6 +40,7 @@ def stitchee(
concat_method: str | None = "xarray-concat",
concat_dim: str = "",
concat_kwargs: dict | None = None,
time_variable: str = "geolocation/time",
history_to_append: str | None = None,
copy_input_files: bool = False,
overwrite_output_file: bool = False,
Expand Down Expand Up @@ -136,7 +138,10 @@ def stitchee(
decode_coords=False,
drop_variables=coord_vars,
)
first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]

# Determine value for later dataset sorting.
first_value = xrds[flatten_string_with_groups(time_variable)].values.flatten()[0]
# first_value = xrds[concatenator.group_delim + concat_dim].values.flatten()[0]
concat_dim_order.append(first_value)

benchmark_log["flattening"] = time.time() - start_time
Expand Down
4 changes: 4 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ class TempDirs(typing.NamedTuple):
toy_data_path: Path


def path_str(dir_path: Path, filename: str) -> str:
    """Return *filename* joined onto *dir_path* as a plain string."""
    return str(dir_path / filename)


def pytest_addoption(parser):
"""Sets up optional argument to keep temporary testing directory."""
parser.addoption(
Expand Down
1 change: 1 addition & 0 deletions tests/integration/test_history_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def test_construct_and_append_history_for_sample_concatenation(
concat_method="xarray-concat",
history_to_append=new_history_json,
concat_dim="step",
time_variable="step",
)
stitcheed_dataset = xr.open_dataset(output_path)

Expand Down
14 changes: 11 additions & 3 deletions tests/unit/test_attribute_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,37 @@

import concatenator
from concatenator.attribute_handling import (
_flatten_coordinate_attribute,
flatten_string_with_groups,
regroup_coordinate_attribute,
)


def test_coordinate_attribute_flattening():
# Case with groups present and double spaces.
assert (
_flatten_coordinate_attribute(
flatten_string_with_groups(
"Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
)
== "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
)

# Case with NO groups present and single spaces.
assert (
_flatten_coordinate_attribute(
flatten_string_with_groups(
"time longitude latitude ozone_profile_pressure ozone_profile_altitude"
)
== "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
)


def test_variable_path_flattening():
    """Variable paths flatten the same way with or without a group prefix."""
    cases = {
        "geolocation/time": "__geolocation__time",  # path inside a group
        "time": "__time",  # root-level variable, no group
    }
    for path, expected in cases.items():
        assert flatten_string_with_groups(path) == expected


def test_coordinate_attribute_regrouping():
# Case with groups present and double spaces.
assert (
Expand Down
72 changes: 32 additions & 40 deletions tests/unit/test_group_handling.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,36 @@
"""Tests for manipulating netCDF groups."""

# pylint: disable=C0116, C0301
import logging
from pathlib import Path

from concatenator.attribute_handling import (
_flatten_coordinate_attribute,
regroup_coordinate_attribute,
)


def test_coordinate_attribute_flattening():
# Case with groups present and double spaces.
assert (
_flatten_coordinate_attribute(
"Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
)
== "__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
)

# Case with NO groups present and single spaces.
assert (
_flatten_coordinate_attribute(
"time longitude latitude ozone_profile_pressure ozone_profile_altitude"
)
== "__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
)


def test_coordinate_attribute_regrouping():
# Case with groups present and double spaces.
assert (
regroup_coordinate_attribute(
"__Time_and_Position__time __Time_and_Position__instrument_fov_latitude __Time_and_Position__instrument_fov_longitude"
)
== "Time_and_Position/time Time_and_Position/instrument_fov_latitude Time_and_Position/instrument_fov_longitude"
)

# Case with NO groups present and single spaces.
assert (
regroup_coordinate_attribute(
"__time __longitude __latitude __ozone_profile_pressure __ozone_profile_altitude"
)
== "time longitude latitude ozone_profile_pressure ozone_profile_altitude"
)
import pytest

from concatenator.dataset_and_group_handling import validate_workable_files

from ..conftest import path_str

logger = logging.getLogger(__name__)


@pytest.mark.usefixtures("pass_options")
class TestGroupHandling:
    """Tests for netCDF dataset/group handling against TEMPO sample granules."""

    # Resolve tests/data/harmony/granules relative to this test file.
    __test_path = Path(__file__).parents[1].resolve()
    __data_path = __test_path.joinpath("data")
    __harmony_path = __data_path.joinpath("harmony")
    __granules_path = __harmony_path.joinpath("granules")

    def test_workable_files_validation(self, temp_output_dir):
        """All three sample granules should be counted as workable netCDF files."""
        granule_names = (
            "TEMPO_NO2_L2_V03_20240601T210934Z_S012G01_subsetted.nc4",
            "TEMPO_NO2_L2_V03_20240601T211614Z_S012G02_subsetted.nc4",
            "TEMPO_NO2_L2_V03_20240601T212254Z_S012G03_subsetted.nc4",
        )
        filepaths = [path_str(self.__granules_path, name) for name in granule_names]

        # Second element of the returned tuple is the count of workable files.
        assert validate_workable_files(filepaths, logger)[1] == 3
4 changes: 1 addition & 3 deletions tests/unit/test_run_stitchee.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
import concatenator
from concatenator.run_stitchee import parse_args


def path_str(dir_path: Path, filename: str) -> str:
return str(dir_path.joinpath(filename))
from ..conftest import path_str


def test_parser():
Expand Down