Skip to content

Commit

Permalink
Merge pull request #6 from lincc-frameworks/dtype-reimpl
Browse files Browse the repository at this point in the history
Dtype reimplementation
  • Loading branch information
hombit authored Apr 4, 2024
2 parents e04f558 + 71a0b71 commit 70afe87
Show file tree
Hide file tree
Showing 9 changed files with 332 additions and 76 deletions.
42 changes: 31 additions & 11 deletions src/nested_pandas/series/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,24 @@ def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame:
pd.DataFrame
Dataframe of list-arrays.
"""
df = self._series.struct.explode()
if fields is None:
return df
return df[fields]
fields = fields if fields is not None else list(self._series.array.field_names)
if len(fields) == 0:
raise ValueError("Cannot convert a struct with no fields to lists")

struct_array = cast(pa.StructArray, pa.array(self._series))

list_series = {}
for field in fields:
list_array = cast(pa.ListArray, struct_array.field(field))
list_series[field] = pd.Series(
list_array,
dtype=pd.ArrowDtype(list_array.type),
index=self._series.index,
name=field,
copy=False,
)

return pd.DataFrame(list_series)

def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
"""Convert nested series into dataframe of flat arrays
Expand All @@ -67,15 +81,16 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
pd.DataFrame
Dataframe of flat arrays.
"""
# For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
fields = fields if fields is not None else list(self._series.array.field_names)
if len(fields) == 0:
raise ValueError("Cannot flatten a struct with no fields")

struct_array = cast(pa.StructArray, pa.array(self._series))

flat_series = {}
index = None
for field in fields:
list_array = cast(pa.ListArray, pa.array(self._series.struct.field(field)))
list_array = cast(pa.ListArray, struct_array.field(field))
if index is None:
index = np.repeat(self._series.index.values, np.diff(list_array.offsets))
flat_series[field] = pd.Series(
Expand All @@ -94,7 +109,6 @@ def flat_length(self) -> int:
@property
def fields(self) -> list[str]:
"""Names of the nested columns"""
# For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
return self._series.array.field_names

def set_flat_field(self, field: str, value: ArrayLike) -> None:
Expand Down Expand Up @@ -176,14 +190,22 @@ def get_list_series(self, field: str) -> pd.Series:
pd.Series
The list-array field.
"""
return self._series.struct.field(field)
struct_array = cast(pa.StructArray, pa.array(self._series))
list_array = struct_array.field(field)
return pd.Series(
list_array,
dtype=pd.ArrowDtype(list_array.type),
index=self._series.index,
name=field,
copy=False,
)

def __getitem__(self, key: str | list[str]) -> pd.Series:
if isinstance(key, list):
new_array = self._series.array.view_fields(key)
return pd.Series(new_array, index=self._series.index, name=self._series.name)

series = self._series.struct.field(key).list.flatten()
series = self.get_list_series(key).list.flatten()
series.index = np.repeat(self._series.index.values, np.diff(self._series.array.list_offsets))
series.name = key
return series
Expand Down Expand Up @@ -232,9 +254,7 @@ def __delitem__(self, key: str) -> None:
self.pop_field(key)

def __iter__(self) -> Generator[str, None, None]:
# For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
yield from iter(self._series.array.field_names)

def __len__(self) -> int:
# For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly
return len(self._series.array.field_names)
183 changes: 120 additions & 63 deletions src/nested_pandas/series/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,73 +2,52 @@
from __future__ import annotations

from collections.abc import Mapping
from typing import cast

# We use Type, because we must use "type" as an attribute name
from typing import Type, cast # noqa: UP035

import pandas as pd
import pyarrow as pa
from pandas import ArrowDtype
from pandas.api.extensions import register_extension_dtype
from pandas.core.arrays import ArrowExtensionArray
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.base import ExtensionDtype

from nested_pandas.series.na import NA, NAType
from nested_pandas.series.utils import is_pa_type_a_list

__all__ = ["NestedDtype"]


@register_extension_dtype
class NestedDtype(ArrowDtype):
class NestedDtype(ExtensionDtype):
"""Data type to handle packed time series data"""

pyarrow_dtype: pa.StructType
# ExtensionDtype overrides #

def __init__(self, pyarrow_dtype: pa.DataType) -> None:
pyarrow_dtype = self._validate_dtype(pyarrow_dtype)
super().__init__(pyarrow_dtype=pyarrow_dtype)
_metadata = ("pyarrow_dtype",)
"""Attributes to use as metadata for __eq__ and __hash__"""

@classmethod
def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821
"""Make NestedDtype from a mapping of field names and list item types.
Parameters
----------
fields : Mapping[str, pa.DataType]
A mapping of field names and their item types. Since all fields are lists, the item types are
inner types of the lists, not the list types themselves.
@property
def na_value(self) -> NAType:
"""The missing value for this dtype"""
return NA

Returns
-------
NestedDtype
The constructed NestedDtype.
type = pd.DataFrame
"""The type of the array's elements, always pd.DataFrame"""

Examples
--------
>>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()})
>>> dtype
nested<a: [double], b: [int64]>
>>> assert (
... dtype.pyarrow_dtype
... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
... )
"""
pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()})
pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
return cls(pyarrow_dtype=pyarrow_dtype)
@property
def name(self) -> str:
"""The string representation of the nested type"""
fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype])
return f"nested<{fields}>"

@staticmethod
def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType:
if not isinstance(pyarrow_dtype, pa.DataType):
raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")
if not pa.types.is_struct(pyarrow_dtype):
raise ValueError("NestedDtype can only be constructed with pyarrow struct type.")
pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
@classmethod
def construct_array_type(cls) -> Type[ExtensionArray]:
"""Corresponded array type, always NestedExtensionArray"""
from nested_pandas.series.ext_array import NestedExtensionArray

for field in pyarrow_dtype:
if not is_pa_type_a_list(field.type):
raise ValueError(
"NestedDtype can only be constructed with pyarrow struct type, all fields must be list "
f"type. Given struct has unsupported field {field}"
)
return pyarrow_dtype
return NestedExtensionArray

@classmethod
def construct_from_string(cls, string: str) -> Self: # type: ignore[name-defined] # noqa: F821
Expand Down Expand Up @@ -135,6 +114,91 @@ def construct_from_string(cls, string: str) -> Self: # type: ignore[name-define

return cls.from_fields(fields)

# ArrowDtype would return None so we do
def _get_common_dtype(self, dtypes: list) -> None:
return None

# Optional methods #

def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray:
    """Wrap a pyarrow array into a NestedExtensionArray.

    Parameters
    ----------
    array : pa.Array | pa.ChunkedArray
        The input pyarrow array.

    Returns
    -------
    NestedExtensionArray
        The extension array built from `array`.
    """
    # Function-scope import; presumably avoids a circular import with
    # ext_array, which itself refers to NestedDtype — confirm.
    from nested_pandas.series.ext_array import NestedExtensionArray

    nested_array = NestedExtensionArray(array)
    return nested_array

# Additional methods and attributes #

pyarrow_dtype: pa.StructType

def __init__(self, pyarrow_dtype: pa.DataType) -> None:
    """Validate `pyarrow_dtype` and store it on the instance.

    Parameters
    ----------
    pyarrow_dtype : pa.DataType
        Type to wrap. It is passed through `_validate_dtype`, and whatever
        that call returns becomes `self.pyarrow_dtype`.
    """
    validated = self._validate_dtype(pyarrow_dtype)
    self.pyarrow_dtype = validated

@classmethod
def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Build a NestedDtype from field names and list item types.

    Parameters
    ----------
    fields : Mapping[str, pa.DataType]
        Field names mapped to their item types. Every field is stored as a
        list, so each value here is the inner (element) type of that list,
        not the list type itself.

    Returns
    -------
    NestedDtype
        The constructed NestedDtype.

    Examples
    --------
    >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()})
    >>> dtype
    nested<a: [double], b: [int64]>
    >>> assert (
    ...     dtype.pyarrow_dtype
    ...     == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
    ... )
    """
    # Wrap each item type in a list type, keeping the mapping's order.
    list_types = {}
    for field_name, item_type in fields.items():
        list_types[field_name] = pa.list_(item_type)

    struct_type = cast(pa.StructType, pa.struct(list_types))
    return cls(pyarrow_dtype=struct_type)

@staticmethod
def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType:
    """Check that `pyarrow_dtype` is a struct type whose fields are all lists.

    Parameters
    ----------
    pyarrow_dtype : pa.DataType
        The type to check.

    Returns
    -------
    pa.StructType
        The same object that was passed in, once it has passed all checks.

    Raises
    ------
    TypeError
        If `pyarrow_dtype` is not a `pa.DataType` instance.
    ValueError
        If it is not a struct type, or if `is_pa_type_a_list` rejects the
        type of any of its fields.
    """
    if not isinstance(pyarrow_dtype, pa.DataType):
        raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")
    if not pa.types.is_struct(pyarrow_dtype):
        raise ValueError("NestedDtype can only be constructed with pyarrow struct type.")
    struct_type = cast(pa.StructType, pyarrow_dtype)

    # Stop at the first field that is not a list; that field is the one reported.
    offending = next(
        (candidate for candidate in struct_type if not is_pa_type_a_list(candidate.type)),
        None,
    )
    if offending is not None:
        raise ValueError(
            "NestedDtype can only be constructed with pyarrow struct type, all fields must be list "
            f"type. Given struct has unsupported field {offending}"
        )
    return struct_type

@property
def fields(self) -> dict[str, pa.DataType]:
    """Field names mapped to the item (list element) type of each field."""
    struct_type = self.pyarrow_dtype
    return {member.name: member.type.value_type for member in struct_type}

@property
def field_names(self) -> list[str]:
    """Names of the nested type's fields, in the order the struct type yields them."""
    struct_type = self.pyarrow_dtype
    return [member.name for member in struct_type]

@classmethod
def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype):
"""Construct NestedDtype from a pandas.ArrowDtype.
Expand All @@ -154,21 +218,14 @@ def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype):
ValueError
If the given dtype is not a valid nested type.
"""
pyarrow_dtype = cls._validate_dtype(pandas_arrow_dtype.pyarrow_dtype)
return cls(pyarrow_dtype=pyarrow_dtype)
return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype)

@classmethod
def construct_array_type(cls) -> type[ArrowExtensionArray]:
"""Corresponded array type, always NestedExtensionArray"""
from nested_pandas.series.ext_array import NestedExtensionArray
def to_pandas_arrow_dtype(self) -> ArrowDtype:
"""Convert NestedDtype to a pandas.ArrowDtype.
return NestedExtensionArray

@property
def name(self) -> str:
"""The string representation of the nested type"""
fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype])
return f"nested<{fields}>"

type = pd.DataFrame
"""The type of the array's elements, always pd.DataFrame"""
Returns
-------
ArrowDtype
The corresponding pandas.ArrowDtype.
"""
return ArrowDtype(self.pyarrow_dtype)
11 changes: 11 additions & 0 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: #
# The previous line may return an iterator, but parent's _from_sequence needs Sequence
if not isinstance(scalars, Sequence) and isinstance(scalars, Collection):
scalars = list(scalars)
if isinstance(dtype, NestedDtype):
dtype = dtype.to_pandas_arrow_dtype()
return super()._from_sequence(scalars, dtype=dtype, copy=copy)

@staticmethod
Expand All @@ -103,6 +105,15 @@ def _validate(array: pa.ChunkedArray) -> None:
if not first_list_array.offsets.equals(list_array.offsets):
raise ValueError("Offsets of all ListArrays must be the same")

@classmethod
def from_arrow_ext_array(cls, array: ArrowExtensionArray) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Build an instance of this class from a pandas ArrowExtensionArray.

    Parameters
    ----------
    array : ArrowExtensionArray
        Source array; its underlying pyarrow data (``array._pa_array``) is
        handed to the class constructor.

    Returns
    -------
    Self
        The newly constructed array.
    """
    pa_data = array._pa_array
    return cls(pa_data)

def to_arrow_ext_array(self) -> ArrowExtensionArray:
    """Return a pandas ArrowExtensionArray built from this array's pyarrow data.

    Returns
    -------
    ArrowExtensionArray
        A new pandas array constructed from ``self._pa_array``.
    """
    pa_data = self._pa_array
    return ArrowExtensionArray(pa_data)

def _replace_pa_array(self, pa_array: pa.ChunkedArray, *, validate: bool) -> None:
if validate:
self._validate(pa_array)
Expand Down
55 changes: 55 additions & 0 deletions src/nested_pandas/series/na.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Missing value for NestedDtype
It is something between pandas' NA and NaN
"""

__all__ = ["NAType", "NA"]


# Empty private placeholder class; it is not exported through ``__all__``.
# NOTE(review): nothing in the visible part of this module references it —
# confirm whether it is still needed.
class _NAType:
    pass


class NAType:
    """Type of the missing-value marker used by NestedDtype.

    The class is a singleton: every ``NAType()`` call returns the same
    object, and `NA` is that instance.

    Most arithmetic and boolean-logic operations are intentionally not
    implemented, because their result would be ambiguous for a missing
    value. The design borrows from both pandas' NA and the float NaN: the
    value never compares equal to anything (itself included) and cannot be
    converted to ``bool``.
    """

    # Holds the one shared instance once it has been created.
    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on the first call."""
        existing = cls._instance
        if existing is not None:
            return existing
        created = super().__new__(cls)
        cls._instance = created
        return created

    def __repr__(self) -> str:
        return "<NA>"

    def __format__(self, format_spec) -> str:
        text = self.__repr__()
        try:
            # Format as a plain string, so width/alignment specs work.
            return text.__format__(format_spec)
        except ValueError:
            # Specs that are invalid for strings fall back to the bare repr.
            return text

    def __bool__(self):
        raise TypeError("boolean value of NA is ambiguous")

    def __eq__(self, other):
        # NaN-like: never equal to anything, not even to itself.
        return False

    def __ne__(self, other):
        return True

    def __hash__(self):
        return 0


# Module-level missing-value marker. NAType.__new__ caches the instance it
# creates, so any later NAType() call returns this same object.
NA = NAType()
"""Missing value for NestedDtype, a singleton instance of `NAType` class."""
Loading

0 comments on commit 70afe87

Please sign in to comment.