From c7aee0a4e3a4493b0166ed6c88d0c81e02872773 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 3 Apr 2024 16:50:09 -0400 Subject: [PATCH 1/3] NA value --- src/nested_pandas/series/na.py | 55 +++++++++++++++++++++++++++ tests/nested_pandas/series/test_na.py | 50 ++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 src/nested_pandas/series/na.py create mode 100644 tests/nested_pandas/series/test_na.py diff --git a/src/nested_pandas/series/na.py b/src/nested_pandas/series/na.py new file mode 100644 index 0000000..0b77bb4 --- /dev/null +++ b/src/nested_pandas/series/na.py @@ -0,0 +1,55 @@ +"""Missing value for NestedDtype + +It is something between pandas' NA and NaN +""" + +__all__ = ["NAType", "NA"] + + +class _NAType: + pass + + +class NAType: + """Singleton class representing missing value for NestedDtype. + + It doesn't implement most of the arithmetics and boolean logic operations, + because they are ambiguous for missing values. + + The implementation is inspired both by pandas' NA and float number NaN. + + `NA` is a singleton instance of this class. 
+ """ + + _instance = None + + def __new__(cls, *args, **kwargs): + """Create a new instance of NAType.""" + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __repr__(self) -> str: + return "" + + def __format__(self, format_spec) -> str: + try: + return self.__repr__().__format__(format_spec) + except ValueError: + return self.__repr__() + + def __bool__(self): + raise TypeError("boolean value of NA is ambiguous") + + def __eq__(self, other): + return False + + def __ne__(self, other): + return True + + def __hash__(self): + return 0 + + +NA = NAType() +"""Missing value for NestedDtype, a singleton instance of `NAType` class.""" diff --git a/tests/nested_pandas/series/test_na.py b/tests/nested_pandas/series/test_na.py new file mode 100644 index 0000000..b3a81b3 --- /dev/null +++ b/tests/nested_pandas/series/test_na.py @@ -0,0 +1,50 @@ +import pytest +from nested_pandas.series.na import NA + + +def test_na_is_singleton(): + """Test that NA is a singleton instance""" + assert NA is NA + + +def test_na_repr(): + """Test that NA has the correct representation.""" + assert repr(NA) == "" + + +def test_na_format(): + """Test that NA has the correct format.""" + assert f"{NA}" == "" + + +def test_na_bool(): + """Test that NA raises TypeError when converted to bool.""" + with pytest.raises(TypeError): + bool(NA) + + +def test_na_eq(): + """Test that NA is not equal to anything.""" + assert NA != 1 + assert NA != 1.0 + assert NA != "1" + assert NA != NA + + +def test_na_neq(): + """Test that NA is not equal to anything.""" + assert NA != 1 + assert NA != 1.0 + assert NA != "1" + assert [] != NA + assert {} != NA + assert NA != () + assert set() != NA + assert NA != NA + assert object() != NA + + +def test_hash(): + """Test that hash(NA) is always the same.""" + assert hash(NA) == hash(NA) + assert {NA, NA} == {NA} From 11a59dbba269c4d30950e0c1ac97be0b79088422 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 3 Apr 
2024 16:56:06 -0400 Subject: [PATCH 2/3] Reimplementation of NestedDtype from scratch It doesn't work right now, probably we need to re-implement extension array --- src/nested_pandas/series/dtype.py | 183 +++++++++++++++-------- tests/nested_pandas/series/test_dtype.py | 32 ++++ 2 files changed, 152 insertions(+), 63 deletions(-) diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index 7dd8c95..0d85409 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -2,73 +2,52 @@ from __future__ import annotations from collections.abc import Mapping -from typing import cast + +# We use Type, because we must use "type" as an attribute name +from typing import Type, cast # noqa: UP035 import pandas as pd import pyarrow as pa from pandas import ArrowDtype from pandas.api.extensions import register_extension_dtype -from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.base import ExtensionDtype +from nested_pandas.series.na import NA, NAType from nested_pandas.series.utils import is_pa_type_a_list __all__ = ["NestedDtype"] @register_extension_dtype -class NestedDtype(ArrowDtype): +class NestedDtype(ExtensionDtype): """Data type to handle packed time series data""" - pyarrow_dtype: pa.StructType + # ExtensionDtype overrides # - def __init__(self, pyarrow_dtype: pa.DataType) -> None: - pyarrow_dtype = self._validate_dtype(pyarrow_dtype) - super().__init__(pyarrow_dtype=pyarrow_dtype) + _metadata = ("pyarrow_dtype",) + """Attributes to use as metadata for __eq__ and __hash__""" - @classmethod - def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821 - """Make NestedDtype from a mapping of field names and list item types. - - Parameters - ---------- - fields : Mapping[str, pa.DataType] - A mapping of field names and their item types. 
Since all fields are lists, the item types are - inner types of the lists, not the list types themselves. + @property + def na_value(self) -> NAType: + """The missing value for this dtype""" + return NA - Returns - ------- - NestedDtype - The constructed NestedDtype. + type = pd.DataFrame + """The type of the array's elements, always pd.DataFrame""" - Examples - -------- - >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()}) - >>> dtype - nested - >>> assert ( - ... dtype.pyarrow_dtype - ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) - ... ) - """ - pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()}) - pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) - return cls(pyarrow_dtype=pyarrow_dtype) + @property + def name(self) -> str: + """The string representation of the nested type""" + fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype]) + return f"nested<{fields}>" - @staticmethod - def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType: - if not isinstance(pyarrow_dtype, pa.DataType): - raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}") - if not pa.types.is_struct(pyarrow_dtype): - raise ValueError("NestedDtype can only be constructed with pyarrow struct type.") - pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) + @classmethod + def construct_array_type(cls) -> Type[ExtensionArray]: + """Corresponded array type, always NestedExtensionArray""" + from nested_pandas.series.ext_array import NestedExtensionArray - for field in pyarrow_dtype: - if not is_pa_type_a_list(field.type): - raise ValueError( - "NestedDtype can only be constructed with pyarrow struct type, all fields must be list " - f"type. 
Given struct has unsupported field {field}" - ) - return pyarrow_dtype + return NestedExtensionArray @classmethod def construct_from_string(cls, string: str) -> Self: # type: ignore[name-defined] # noqa: F821 @@ -135,6 +114,91 @@ def construct_from_string(cls, string: str) -> Self: # type: ignore[name-define return cls.from_fields(fields) + # ArrowDtype would return None so we do + def _get_common_dtype(self, dtypes: list) -> None: + return None + + # Optional methods # + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray: + """Construct a NestedExtensionArray from a pyarrow array. + + Parameters + ---------- + array : pa.Array | pa.ChunkedArray + The input pyarrow array. + + Returns + ------- + NestedExtensionArray + The constructed NestedExtensionArray. + """ + from nested_pandas.series.ext_array import NestedExtensionArray + + return NestedExtensionArray(array) + + # Additional methods and attributes # + + pyarrow_dtype: pa.StructType + + def __init__(self, pyarrow_dtype: pa.DataType) -> None: + self.pyarrow_dtype = self._validate_dtype(pyarrow_dtype) + + @classmethod + def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821 + """Make NestedDtype from a mapping of field names and list item types. + + Parameters + ---------- + fields : Mapping[str, pa.DataType] + A mapping of field names and their item types. Since all fields are lists, the item types are + inner types of the lists, not the list types themselves. + + Returns + ------- + NestedDtype + The constructed NestedDtype. + + Examples + -------- + >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()}) + >>> dtype + nested + >>> assert ( + ... dtype.pyarrow_dtype + ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) + ... 
) + """ + pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()}) + pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) + return cls(pyarrow_dtype=pyarrow_dtype) + + @staticmethod + def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType: + if not isinstance(pyarrow_dtype, pa.DataType): + raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}") + if not pa.types.is_struct(pyarrow_dtype): + raise ValueError("NestedDtype can only be constructed with pyarrow struct type.") + pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) + + for field in pyarrow_dtype: + if not is_pa_type_a_list(field.type): + raise ValueError( + "NestedDtype can only be constructed with pyarrow struct type, all fields must be list " + f"type. Given struct has unsupported field {field}" + ) + return pyarrow_dtype + + @property + def fields(self) -> dict[str, pa.DataType]: + """The mapping of field names and their item types.""" + return {field.name: field.type.value_type for field in self.pyarrow_dtype} + + @property + def field_names(self) -> list[str]: + """The list of field names of the nested type""" + return [field.name for field in self.pyarrow_dtype] + @classmethod def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype): """Construct NestedDtype from a pandas.ArrowDtype. @@ -154,21 +218,14 @@ def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype): ValueError If the given dtype is not a valid nested type. """ - pyarrow_dtype = cls._validate_dtype(pandas_arrow_dtype.pyarrow_dtype) - return cls(pyarrow_dtype=pyarrow_dtype) + return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype) - @classmethod - def construct_array_type(cls) -> type[ArrowExtensionArray]: - """Corresponded array type, always NestedExtensionArray""" - from nested_pandas.series.ext_array import NestedExtensionArray + def to_pandas_arrow_dtype(self) -> ArrowDtype: + """Convert NestedDtype to a pandas.ArrowDtype. 
- return NestedExtensionArray - - @property - def name(self) -> str: - """The string representation of the nested type""" - fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype]) - return f"nested<{fields}>" - - type = pd.DataFrame - """The type of the array's elements, always pd.DataFrame""" + Returns + ------- + ArrowDtype + The corresponding pandas.ArrowDtype. + """ + return ArrowDtype(self.pyarrow_dtype) diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py index 0abcd61..4c2d63f 100644 --- a/tests/nested_pandas/series/test_dtype.py +++ b/tests/nested_pandas/series/test_dtype.py @@ -1,7 +1,9 @@ +import pandas as pd import pyarrow as pa import pytest from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.ext_array import NestedExtensionArray +from nested_pandas.series.na import NA @pytest.mark.parametrize( @@ -40,6 +42,14 @@ def test_from_pyarrow_dtype_raises(pyarrow_dtype): NestedDtype(pyarrow_dtype) +def test_to_pandas_pyarrow_dtype(): + """Test that NestedDtype.to_pandas_pyarrow_dtype() returns the correct pyarrow struct type.""" + dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.int64()}) + assert dtype.to_pandas_pyarrow_dtype() == pd.ArrowDtype( + pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) + ) + + def test_from_fields(): """Test NestedDtype.from_fields().""" fields = {"a": pa.int64(), "b": pa.float64()} @@ -49,6 +59,28 @@ def test_from_fields(): ) +def test_na_value(): + """Test that NestedDtype.na_value is a singleton instance of NAType.""" + dtype = NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])) + assert dtype.na_value is NA + + +def test_fields(): + """Test NestedDtype.fields property""" + dtype = NestedDtype( + pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) + ) + assert dtype.fields == {"a": pa.int64(), "b": pa.float64()} + + +def 
test_field_names(): + """Test NestedDtype.field_names property""" + dtype = NestedDtype( + pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) + ) + assert dtype.field_names == ["a", "b"] + + @pytest.mark.parametrize( "fields", [ From 71a0b71395e6fdac81294970d8c85e9f8b653837 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Thu, 4 Apr 2024 10:07:31 -0400 Subject: [PATCH 3/3] dtype reimplementation NestedDtype doesn't inherit ArrowDtype anymore --- src/nested_pandas/series/accessor.py | 42 +++++++++++++++----- src/nested_pandas/series/ext_array.py | 11 +++++ src/nested_pandas/series/packer.py | 2 +- tests/nested_pandas/series/test_dtype.py | 8 ++-- tests/nested_pandas/series/test_ext_array.py | 31 +++++++++++++++ tests/nested_pandas/series/test_packer.py | 2 +- 6 files changed, 79 insertions(+), 17 deletions(-) diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 49876fb..9f1a719 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -49,10 +49,24 @@ def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame: pd.DataFrame Dataframe of list-arrays. 
""" - df = self._series.struct.explode() - if fields is None: - return df - return df[fields] + fields = fields if fields is not None else list(self._series.array.field_names) + if len(fields) == 0: + raise ValueError("Cannot convert a struct with no fields to lists") + + struct_array = cast(pa.StructArray, pa.array(self._series)) + + list_series = {} + for field in fields: + list_array = cast(pa.ListArray, struct_array.field(field)) + list_series[field] = pd.Series( + list_array, + dtype=pd.ArrowDtype(list_array.type), + index=self._series.index, + name=field, + copy=False, + ) + + return pd.DataFrame(list_series) def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame: """Convert nested series into dataframe of flat arrays @@ -67,15 +81,16 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame: pd.DataFrame Dataframe of flat arrays. """ - # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly fields = fields if fields is not None else list(self._series.array.field_names) if len(fields) == 0: raise ValueError("Cannot flatten a struct with no fields") + struct_array = cast(pa.StructArray, pa.array(self._series)) + flat_series = {} index = None for field in fields: - list_array = cast(pa.ListArray, pa.array(self._series.struct.field(field))) + list_array = cast(pa.ListArray, struct_array.field(field)) if index is None: index = np.repeat(self._series.index.values, np.diff(list_array.offsets)) flat_series[field] = pd.Series( @@ -94,7 +109,6 @@ def flat_length(self) -> int: @property def fields(self) -> list[str]: """Names of the nested columns""" - # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly return self._series.array.field_names def set_flat_field(self, field: str, value: ArrayLike) -> None: @@ -176,14 +190,22 @@ def get_list_series(self, field: str) -> pd.Series: pd.Series The list-array field. 
""" - return self._series.struct.field(field) + struct_array = cast(pa.StructArray, pa.array(self._series)) + list_array = struct_array.field(field) + return pd.Series( + list_array, + dtype=pd.ArrowDtype(list_array.type), + index=self._series.index, + name=field, + copy=False, + ) def __getitem__(self, key: str | list[str]) -> pd.Series: if isinstance(key, list): new_array = self._series.array.view_fields(key) return pd.Series(new_array, index=self._series.index, name=self._series.name) - series = self._series.struct.field(key).list.flatten() + series = self.get_list_series(key).list.flatten() series.index = np.repeat(self._series.index.values, np.diff(self._series.array.list_offsets)) series.name = key return series @@ -232,9 +254,7 @@ def __delitem__(self, key: str) -> None: self.pop_field(key) def __iter__(self) -> Generator[str, None, None]: - # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly yield from iter(self._series.array.field_names) def __len__(self) -> int: - # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly return len(self._series.array.field_names) diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index cfba3e7..df0b99f 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -80,6 +80,8 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: # # The previous line may return an iterator, but parent's _from_sequence needs Sequence if not isinstance(scalars, Sequence) and isinstance(scalars, Collection): scalars = list(scalars) + if isinstance(dtype, NestedDtype): + dtype = dtype.to_pandas_arrow_dtype() return super()._from_sequence(scalars, dtype=dtype, copy=copy) @staticmethod @@ -103,6 +105,15 @@ def _validate(array: pa.ChunkedArray) -> None: if not first_list_array.offsets.equals(list_array.offsets): raise ValueError("Offsets of all ListArrays must be the same") + 
@classmethod + def from_arrow_ext_array(cls, array: ArrowExtensionArray) -> Self: # type: ignore[name-defined] # noqa: F821 + """Create a NestedExtensionArray from pandas' ArrowExtensionArray""" + return cls(array._pa_array) + + def to_arrow_ext_array(self) -> ArrowExtensionArray: + """Convert the extension array to pandas' ArrowExtensionArray""" + return ArrowExtensionArray(self._pa_array) + def _replace_pa_array(self, pa_array: pa.ChunkedArray, *, validate: bool) -> None: if validate: self._validate(pa_array) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index cca03c0..0bb25c4 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -46,7 +46,7 @@ def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame: """ # TODO: we can optimize name=None case a bit struct_series = pack_flat(df, name=name) - packed_df = struct_series.struct.explode() + packed_df = struct_series.nest.to_lists() if name is not None: packed_df[name] = struct_series return packed_df diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py index 4c2d63f..54c40bf 100644 --- a/tests/nested_pandas/series/test_dtype.py +++ b/tests/nested_pandas/series/test_dtype.py @@ -42,10 +42,10 @@ def test_from_pyarrow_dtype_raises(pyarrow_dtype): NestedDtype(pyarrow_dtype) -def test_to_pandas_pyarrow_dtype(): - """Test that NestedDtype.to_pandas_pyarrow_dtype() returns the correct pyarrow struct type.""" - dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.int64()}) - assert dtype.to_pandas_pyarrow_dtype() == pd.ArrowDtype( +def test_to_pandas_arrow_dtype(): + """Test that NestedDtype.to_pandas_arrow_dtype() returns the correct pyarrow struct type.""" + dtype = NestedDtype.from_fields({"a": pa.int64(), "b": pa.float64()}) + assert dtype.to_pandas_arrow_dtype() == pd.ArrowDtype( pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) ) diff --git 
a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py index c4d9ead..2141d5c 100644 --- a/tests/nested_pandas/series/test_ext_array.py +++ b/tests/nested_pandas/series/test_ext_array.py @@ -5,6 +5,7 @@ from nested_pandas import NestedDtype from nested_pandas.series.ext_array import NestedExtensionArray from numpy.testing import assert_array_equal +from pandas.core.arrays import ArrowExtensionArray from pandas.testing import assert_frame_equal, assert_series_equal @@ -626,3 +627,33 @@ def test_delete_last_field_raises(): with pytest.raises(ValueError): ext_array.pop_field("b") + + +def test_from_arrow_ext_array(): + """Tests that we can create a NestedExtensionArray from an ArrowExtensionArray.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1, 2, 3]), np.array([1, 2, 1, 2])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = ArrowExtensionArray(struct_array) + + from_arrow = NestedExtensionArray.from_arrow_ext_array(ext_array) + assert_series_equal(pd.Series(ext_array), pd.Series(from_arrow), check_dtype=False) + + +def test_to_arrow_ext_array(): + """Tests that we can create an ArrowExtensionArray from a NestedExtensionArray.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1, 2, 3]), np.array([1, 2, 1, 2])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + to_arrow = ext_array.to_arrow_ext_array() + assert_series_equal(pd.Series(ext_array), pd.Series(to_arrow), check_dtype=False) diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index 5859a9d..351572b 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -129,7 +129,7 @@ def test_pack_lists(): series = packer.pack_lists(packed_df) for 
field_name in packed_df.columns: - assert_series_equal(series.struct.field(field_name), packed_df[field_name]) + assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name]) def test_pack_dfs():