Merge pull request #6 from lincc-frameworks/dtype-reimpl

Dtype reimplementation
lincc-frameworks · Apr 4, 2024 · 70afe87 · 70afe87
2 parents e04f558 + 71a0b71
commit 70afe87
Show file tree

Hide file tree

Showing 9 changed files with 332 additions and 76 deletions.
diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py
@@ -49,10 +49,24 @@ def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame:
         pd.DataFrame
             Dataframe of list-arrays.
         """
-        df = self._series.struct.explode()
-        if fields is None:
-            return df
-        return df[fields]
+        fields = fields if fields is not None else list(self._series.array.field_names)
+        if len(fields) == 0:
+            raise ValueError("Cannot convert a struct with no fields to lists")
+
+        struct_array = cast(pa.StructArray, pa.array(self._series))
+
+        list_series = {}
+        for field in fields:
+            list_array = cast(pa.ListArray, struct_array.field(field))
+            list_series[field] = pd.Series(
+                list_array,
+                dtype=pd.ArrowDtype(list_array.type),
+                index=self._series.index,
+                name=field,
+                copy=False,
+            )
+
+        return pd.DataFrame(list_series)
 
     def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
         """Convert nested series into dataframe of flat arrays
@@ -67,15 +81,16 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
         pd.DataFrame
             Dataframe of flat arrays.
         """
-        # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
         fields = fields if fields is not None else list(self._series.array.field_names)
         if len(fields) == 0:
             raise ValueError("Cannot flatten a struct with no fields")
 
+        struct_array = cast(pa.StructArray, pa.array(self._series))
+
         flat_series = {}
         index = None
         for field in fields:
-            list_array = cast(pa.ListArray, pa.array(self._series.struct.field(field)))
+            list_array = cast(pa.ListArray, struct_array.field(field))
             if index is None:
                 index = np.repeat(self._series.index.values, np.diff(list_array.offsets))
             flat_series[field] = pd.Series(
@@ -94,7 +109,6 @@ def flat_length(self) -> int:
     @property
     def fields(self) -> list[str]:
         """Names of the nested columns"""
-        # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
         return self._series.array.field_names
 
     def set_flat_field(self, field: str, value: ArrayLike) -> None:
@@ -176,14 +190,22 @@ def get_list_series(self, field: str) -> pd.Series:
         pd.Series
             The list-array field.
         """
-        return self._series.struct.field(field)
+        struct_array = cast(pa.StructArray, pa.array(self._series))
+        list_array = struct_array.field(field)
+        return pd.Series(
+            list_array,
+            dtype=pd.ArrowDtype(list_array.type),
+            index=self._series.index,
+            name=field,
+            copy=False,
+        )
 
     def __getitem__(self, key: str | list[str]) -> pd.Series:
         if isinstance(key, list):
             new_array = self._series.array.view_fields(key)
             return pd.Series(new_array, index=self._series.index, name=self._series.name)
 
-        series = self._series.struct.field(key).list.flatten()
+        series = self.get_list_series(key).list.flatten()
         series.index = np.repeat(self._series.index.values, np.diff(self._series.array.list_offsets))
         series.name = key
         return series
@@ -232,9 +254,7 @@ def __delitem__(self, key: str) -> None:
         self.pop_field(key)
 
     def __iter__(self) -> Generator[str, None, None]:
-        # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
         yield from iter(self._series.array.field_names)
 
     def __len__(self) -> int:
-        # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
         return len(self._series.array.field_names)
diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py
@@ -2,73 +2,52 @@
 from __future__ import annotations
 
 from collections.abc import Mapping
-from typing import cast
+
+# We use typing.Type because the dtype class defines a "type" attribute that shadows the builtin
+from typing import Type, cast  # noqa: UP035
 
 import pandas as pd
 import pyarrow as pa
 from pandas import ArrowDtype
 from pandas.api.extensions import register_extension_dtype
-from pandas.core.arrays import ArrowExtensionArray
+from pandas.core.arrays import ExtensionArray
+from pandas.core.dtypes.base import ExtensionDtype
 
+from nested_pandas.series.na import NA, NAType
 from nested_pandas.series.utils import is_pa_type_a_list
 
 __all__ = ["NestedDtype"]
 
 
 @register_extension_dtype
-class NestedDtype(ArrowDtype):
+class NestedDtype(ExtensionDtype):
     """Data type to handle packed time series data"""
 
-    pyarrow_dtype: pa.StructType
+    # ExtensionDtype overrides #
 
-    def __init__(self, pyarrow_dtype: pa.DataType) -> None:
-        pyarrow_dtype = self._validate_dtype(pyarrow_dtype)
-        super().__init__(pyarrow_dtype=pyarrow_dtype)
+    _metadata = ("pyarrow_dtype",)
+    """Attributes to use as metadata for __eq__ and __hash__"""
 
-    @classmethod
-    def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self:  # type: ignore[name-defined] # noqa: F821
-        """Make NestedDtype from a mapping of field names and list item types.
-
-        Parameters
-        ----------
-        fields : Mapping[str, pa.DataType]
-            A mapping of field names and their item types. Since all fields are lists, the item types are
-            inner types of the lists, not the list types themselves.
+    @property
+    def na_value(self) -> NAType:
+        """The missing value for this dtype"""
+        return NA
 
-        Returns
-        -------
-        NestedDtype
-            The constructed NestedDtype.
+    type = pd.DataFrame
+    """The type of the array's elements, always pd.DataFrame"""
 
-        Examples
-        --------
-        >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()})
-        >>> dtype
-        nested<a: [double], b: [int64]>
-        >>> assert (
-        ...     dtype.pyarrow_dtype
-        ...     == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
-        ... )
-        """
-        pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()})
-        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
-        return cls(pyarrow_dtype=pyarrow_dtype)
+    @property
+    def name(self) -> str:
+        """The string representation of the nested type"""
+        fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype])
+        return f"nested<{fields}>"
 
-    @staticmethod
-    def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType:
-        if not isinstance(pyarrow_dtype, pa.DataType):
-            raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")
-        if not pa.types.is_struct(pyarrow_dtype):
-            raise ValueError("NestedDtype can only be constructed with pyarrow struct type.")
-        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
+    @classmethod
+    def construct_array_type(cls) -> Type[ExtensionArray]:
+        """Corresponding array type, always NestedExtensionArray"""
+        from nested_pandas.series.ext_array import NestedExtensionArray
 
-        for field in pyarrow_dtype:
-            if not is_pa_type_a_list(field.type):
-                raise ValueError(
-                    "NestedDtype can only be constructed with pyarrow struct type, all fields must be list "
-                    f"type. Given struct has unsupported field {field}"
-                )
-        return pyarrow_dtype
+        return NestedExtensionArray
 
     @classmethod
     def construct_from_string(cls, string: str) -> Self:  # type: ignore[name-defined] # noqa: F821
@@ -135,6 +114,91 @@ def construct_from_string(cls, string: str) -> Self:  # type: ignore[name-define
 
         return cls.from_fields(fields)
 
+    # ArrowDtype would return None here, so we do the same
+    def _get_common_dtype(self, dtypes: list) -> None:
+        return None
+
+    # Optional methods #
+
+    def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray:
+        """Construct a NestedExtensionArray from a pyarrow array.
+
+        Parameters
+        ----------
+        array : pa.Array | pa.ChunkedArray
+            The input pyarrow array.
+
+        Returns
+        -------
+        NestedExtensionArray
+            The constructed NestedExtensionArray.
+        """
+        from nested_pandas.series.ext_array import NestedExtensionArray
+
+        return NestedExtensionArray(array)
+
+    # Additional methods and attributes #
+
+    pyarrow_dtype: pa.StructType
+
+    def __init__(self, pyarrow_dtype: pa.DataType) -> None:
+        self.pyarrow_dtype = self._validate_dtype(pyarrow_dtype)
+
+    @classmethod
+    def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Make NestedDtype from a mapping of field names and list item types.
+
+        Parameters
+        ----------
+        fields : Mapping[str, pa.DataType]
+            A mapping of field names and their item types. Since all fields are lists, the item types are
+            inner types of the lists, not the list types themselves.
+
+        Returns
+        -------
+        NestedDtype
+            The constructed NestedDtype.
+
+        Examples
+        --------
+        >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()})
+        >>> dtype
+        nested<a: [double], b: [int64]>
+        >>> assert (
+        ...     dtype.pyarrow_dtype
+        ...     == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
+        ... )
+        """
+        pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()})
+        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
+        return cls(pyarrow_dtype=pyarrow_dtype)
+
+    @staticmethod
+    def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType:
+        if not isinstance(pyarrow_dtype, pa.DataType):
+            raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")
+        if not pa.types.is_struct(pyarrow_dtype):
+            raise ValueError("NestedDtype can only be constructed with pyarrow struct type.")
+        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
+
+        for field in pyarrow_dtype:
+            if not is_pa_type_a_list(field.type):
+                raise ValueError(
+                    "NestedDtype can only be constructed with pyarrow struct type, all fields must be list "
+                    f"type. Given struct has unsupported field {field}"
+                )
+        return pyarrow_dtype
+
+    @property
+    def fields(self) -> dict[str, pa.DataType]:
+        """The mapping of field names and their item types."""
+        return {field.name: field.type.value_type for field in self.pyarrow_dtype}
+
+    @property
+    def field_names(self) -> list[str]:
+        """The list of field names of the nested type"""
+        return [field.name for field in self.pyarrow_dtype]
+
     @classmethod
     def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype):
         """Construct NestedDtype from a pandas.ArrowDtype.
@@ -154,21 +218,14 @@ def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype):
         ValueError
             If the given dtype is not a valid nested type.
         """
-        pyarrow_dtype = cls._validate_dtype(pandas_arrow_dtype.pyarrow_dtype)
-        return cls(pyarrow_dtype=pyarrow_dtype)
+        return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype)
 
-    @classmethod
-    def construct_array_type(cls) -> type[ArrowExtensionArray]:
-        """Corresponded array type, always NestedExtensionArray"""
-        from nested_pandas.series.ext_array import NestedExtensionArray
+    def to_pandas_arrow_dtype(self) -> ArrowDtype:
+        """Convert NestedDtype to a pandas.ArrowDtype.
 
-        return NestedExtensionArray
-
-    @property
-    def name(self) -> str:
-        """The string representation of the nested type"""
-        fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype])
-        return f"nested<{fields}>"
-
-    type = pd.DataFrame
-    """The type of the array's elements, always pd.DataFrame"""
+        Returns
+        -------
+        ArrowDtype
+            The corresponding pandas.ArrowDtype.
+        """
+        return ArrowDtype(self.pyarrow_dtype)
diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -80,6 +80,8 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self:  #
         # The previous line may return an iterator, but parent's _from_sequence needs Sequence
         if not isinstance(scalars, Sequence) and isinstance(scalars, Collection):
             scalars = list(scalars)
+        if isinstance(dtype, NestedDtype):
+            dtype = dtype.to_pandas_arrow_dtype()
         return super()._from_sequence(scalars, dtype=dtype, copy=copy)
 
     @staticmethod
@@ -103,6 +105,15 @@ def _validate(array: pa.ChunkedArray) -> None:
                 if not first_list_array.offsets.equals(list_array.offsets):
                     raise ValueError("Offsets of all ListArrays must be the same")
 
+    @classmethod
+    def from_arrow_ext_array(cls, array: ArrowExtensionArray) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Create a NestedExtensionArray from pandas' ArrowExtensionArray"""
+        return cls(array._pa_array)
+
+    def to_arrow_ext_array(self) -> ArrowExtensionArray:
+        """Convert the extension array to pandas' ArrowExtensionArray"""
+        return ArrowExtensionArray(self._pa_array)
+
     def _replace_pa_array(self, pa_array: pa.ChunkedArray, *, validate: bool) -> None:
         if validate:
             self._validate(pa_array)

diff --git a/src/nested_pandas/series/na.py b/src/nested_pandas/series/na.py
@@ -0,0 +1,55 @@
+"""Missing value for NestedDtype
+
+It is something between pandas' NA and NaN
+"""
+
+__all__ = ["NAType", "NA"]
+
+
+class _NAType:
+    pass
+
+
+class NAType:
+    """Singleton class representing missing value for NestedDtype.
+
+    It doesn't implement most arithmetic and boolean logic operations,
+    because they are ambiguous for missing values.
+
+    The implementation is inspired by both pandas' NA and the floating-point NaN.
+
+    `NA` is a singleton instance of this class.
+    """
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        """Create a new instance of NAType."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __repr__(self) -> str:
+        return "<NA>"
+
+    def __format__(self, format_spec) -> str:
+        try:
+            return self.__repr__().__format__(format_spec)
+        except ValueError:
+            return self.__repr__()
+
+    def __bool__(self):
+        raise TypeError("boolean value of NA is ambiguous")
+
+    def __eq__(self, other):
+        return False
+
+    def __ne__(self, other):
+        return True
+
+    def __hash__(self):
+        return 0
+
+
+NA = NAType()
+"""Missing value for NestedDtype, a singleton instance of the `NAType` class."""