From dcb26f56acd0f1dbb9104f590955dd7104b22db8 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 17 Dec 2024 16:14:07 +1100 Subject: [PATCH 1/2] fix: Handle none values --- packages/ref/src/ref/datasets/cmip6.py | 15 +++++----- .../ref/tests/unit/datasets/test_cmip6.py | 28 ++++++++++++++++++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/packages/ref/src/ref/datasets/cmip6.py b/packages/ref/src/ref/datasets/cmip6.py index 433639e..90a6e94 100644 --- a/packages/ref/src/ref/datasets/cmip6.py +++ b/packages/ref/src/ref/datasets/cmip6.py @@ -52,17 +52,18 @@ def _fix_parent_variant_label(group: pd.DataFrame) -> pd.DataFrame: data_catalog = data_catalog.groupby("instance_id").apply(_fix_parent_variant_label).reset_index(drop=True) - # EC-Earth3 uses "D" as a suffix for the branch_time_in_child and branch_time_in_parent columns - data_catalog["branch_time_in_child"] = pd.to_numeric( - data_catalog["branch_time_in_child"].astype(str).str.replace("D", ""), errors="raise" - ) - data_catalog["branch_time_in_parent"] = pd.to_numeric( - data_catalog["branch_time_in_parent"].astype(str).str.replace("D", ""), errors="raise" - ) + data_catalog["branch_time_in_child"] = _clean_branch_time(data_catalog["branch_time_in_child"]) + data_catalog["branch_time_in_parent"] = _clean_branch_time(data_catalog["branch_time_in_parent"]) return data_catalog +def _clean_branch_time(branch_time: pd.Series[str]) -> pd.Series[float]: + # EC-Earth3 uses "D" as a suffix for the branch_time_in_child and branch_time_in_parent columns + # Handle missing values (these result in nan values) + return pd.to_numeric(branch_time.astype(str).str.replace("D", "").replace("None", ""), errors="raise") + + class CMIP6DatasetAdapter(DatasetAdapter): """ Adapter for CMIP6 datasets diff --git a/packages/ref/tests/unit/datasets/test_cmip6.py b/packages/ref/tests/unit/datasets/test_cmip6.py index 5899a5d..ba5583b 100644 --- a/packages/ref/tests/unit/datasets/test_cmip6.py +++ b/packages/ref/tests/unit/datasets/test_cmip6.py @@ -1,9 +1,10 @@ import datetime +import numpy as np import pandas as pd import pytest -from ref.datasets.cmip6 import CMIP6DatasetAdapter, _parse_datetime +from ref.datasets.cmip6 import CMIP6DatasetAdapter, _apply_fixes, _parse_datetime @pytest.fixture @@ -76,3 +77,28 @@ def test_load_local_datasets(self, esgf_data_dir, catalog_regression): catalog_regression( data_catalog.sort_values(["instance_id", "start_time"]), basename="cmip6_catalog_local" ) + + +def test_apply_fixes(): + df = pd.DataFrame( + { + "instance_id": ["dataset_001", "dataset_001", "dataset_002"], + "parent_variant_label": ["r1i1p1f1", "r1i1p1f2", "r1i1p1f2"], + "variant_label": ["r1i1p1f1", "r1i1p1f1", "r1i1p1f2"], + "branch_time_in_child": ["0D", "12", "12.0"], + "branch_time_in_parent": [None, "12", "12.0"], + } + ) + + res = _apply_fixes(df) + + exp = pd.DataFrame( + { + "instance_id": ["dataset_001", "dataset_001", "dataset_002"], + "parent_variant_label": ["r1i1p1f1", "r1i1p1f1", "r1i1p1f2"], + "variant_label": ["r1i1p1f1", "r1i1p1f1", "r1i1p1f2"], + "branch_time_in_child": [0.0, 12.0, 12.0], + "branch_time_in_parent": [np.nan, 12.0, 12.0], + } + ) + pd.testing.assert_frame_equal(res, exp) From 508c1f46dca9c2e15c0622a387316d40d076684c Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 17 Dec 2024 16:16:04 +1100 Subject: [PATCH 2/2] docs: Changelog --- changelog/42.fix.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changelog/42.fix.md diff --git a/changelog/42.fix.md b/changelog/42.fix.md new file mode 100644 index 0000000..137575f --- /dev/null +++ b/changelog/42.fix.md @@ -0,0 +1,2 @@ +Handle missing branch times. +Fixes [#38](https://github.com/CMIP-REF/cmip-ref/issues/38).