Skip to content

Commit

Permalink
Accessor(MutableMapping) -> Mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
hombit committed May 29, 2024
1 parent 41cce93 commit d86f37e
Show file tree
Hide file tree
Showing 4 changed files with 322 additions and 87 deletions.
80 changes: 56 additions & 24 deletions src/nested_pandas/series/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from __future__ import annotations

from collections import defaultdict
from collections.abc import Generator, MutableMapping
from collections.abc import Generator, Mapping
from typing import cast

import numpy as np
Expand All @@ -18,7 +18,7 @@


@register_series_accessor("nest")
class NestSeriesAccessor(MutableMapping):
class NestSeriesAccessor(Mapping):
"""Accessor for operations on Series of NestedDtype
This accessor implements the read-only `Mapping` interface over the fields of the
Expand Down Expand Up @@ -124,8 +124,8 @@ def fields(self) -> list[str]:
"""Names of the nested columns"""
return self._series.array.field_names

def set_flat_field(self, field: str, value: ArrayLike) -> None:
"""Set the field from flat-array of values, in-place
def with_flat_field(self, field: str, value: ArrayLike) -> pd.Series:
"""Set the field from flat-array of values and return a new series
Parameters
----------
Expand All @@ -134,11 +134,18 @@ def set_flat_field(self, field: str, value: ArrayLike) -> None:
value : ArrayLike
Array of values to set. It must be a scalar or have the same length
as the flat arrays, i.e. `self.flat_length`.
Returns
-------
pd.Series
The new series with the field set.
"""
self._series.array.set_flat_field(field, value)
new_array = self._series.array.copy()
new_array.set_flat_field(field, value)
return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)

def set_list_field(self, field: str, value: ArrayLike) -> None:
"""Set the field from list-array, in-place
def with_list_field(self, field: str, value: ArrayLike) -> pd.Series:
"""Set the field from list-array of values and return a new series
Parameters
----------
Expand All @@ -147,27 +154,37 @@ def set_list_field(self, field: str, value: ArrayLike) -> None:
value : ArrayLike
Array of values to set. It must be a list-array of the same length
as the series.
Returns
-------
pd.Series
The new series with the field set.
"""
self._series.array.set_list_field(field, value)
new_array = self._series.array.copy()
new_array.set_list_field(field, value)
return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)

def without_field(self, field: str | list[str]) -> pd.Series:
"""Remove the field(s) from the series and return a new series
# I intentionally don't call it `drop` or `drop_field` because `pd.DataFrame.drop` is not inplace
# by default, and I wouldn't like to surprise the user.
def pop_field(self, field: str) -> pd.Series:
"""Delete the field from the struct and return it.
Note that at least one field must be left in the series.
Parameters
----------
field : str
Name of the field to delete.
field : str or list[str]
Name of the field(s) to remove.
Returns
-------
pd.Series
The deleted field.
The new series without the field(s).
"""
series = self[field]
self._series.array.pop_field(field)
return series
if isinstance(field, str):
field = [field]

new_array = self._series.array.copy()
new_array.pop_fields(field)
return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)

def query_flat(self, query: str) -> pd.Series:
"""Query the flat arrays with a boolean expression
Expand Down Expand Up @@ -255,6 +272,12 @@ def __getitem__(self, key: str | list[str]) -> pd.Series:
return self.get_flat_series(key)

def __setitem__(self, key: str, value: ArrayLike) -> None:
"""Replace the field values from flat-array of values
Currently, only replacement of the whole field is supported; the length
and dtype of the input value must match the field. See
https://github.com/lincc-frameworks/nested-pandas/issues/87
"""
# TODO: we can be much smarter about the performance here
# TODO: think more carefully about the underlying pa.ChunkedArray in both self._series.array and value

Expand All @@ -268,7 +291,7 @@ def __setitem__(self, key: str, value: ArrayLike) -> None:

# Set single value for all rows
if ndim == 0:
self.set_flat_field(key, value)
self._series.array.set_flat_field(key, value, keep_dtype=True)
return

if isinstance(value, pd.Series) and not self.get_flat_index().equals(value.index):
Expand All @@ -284,13 +307,22 @@ def __setitem__(self, key: str, value: ArrayLike) -> None:
f"{len(self._series)}."
)

self.set_flat_field(key, pa_array)

def __delitem__(self, key: str) -> None:
self.pop_field(key)
self._series.array.set_flat_field(key, pa_array, keep_dtype=True)

def __iter__(self) -> Generator[str, None, None]:
yield from iter(self._series.array.field_names)
return iter(self._series.array.field_names)

def __len__(self) -> int:
return len(self._series.array.field_names)

def __eq__(self, other) -> bool:
if not isinstance(other, type(self)):
return False
return self._series.equals(other._series)

def clear(self) -> None:
"""Mandatory MutableMapping method, always fails with NotImplementedError
The reason is that we cannot delete all nested fields from the nested series.
"""
raise NotImplementedError("Cannot delete fields from nested series")
93 changes: 74 additions & 19 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

from collections.abc import Iterator, Sequence
from collections.abc import Iterable, Iterator, Sequence
from typing import Any, Callable, cast

import numpy as np
Expand Down Expand Up @@ -676,43 +676,94 @@ def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-def

return self.__class__(pa_array, validate=False)

def set_flat_field(self, field: str, value: ArrayLike) -> None:
def set_flat_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = False) -> None:
"""Set the field from flat-array of values
Note that if this updates the dtype, it does not affect the dtype of
the pd.Series backed by this extension array.
Parameters
----------
field : str
The name of the field.
value : ArrayLike
The 'flat' array of values to be set.
keep_dtype : bool, default False
Whether to keep the original dtype of the field. If True,
no new field will be created, and the dtype of the existing
field will be kept. If False, the dtype of the field will be
inferred from the input value.
"""
# TODO: optimize for the case when the input is a pa.ChunkedArray

if keep_dtype:
if field not in self.field_names:
raise ValueError(
"If keep_dtype is True, the field must exist in the series. "
f"Got: {field}, available: {self.field_names}"
)
# Get the current element type of list-array
pa_type = self._pa_array.chunk(0).field(field).type.value_type
else:
pa_type = None

if np.ndim(value) == 0:
value = np.repeat(value, self.flat_length)

pa_array = pa.array(value)
try:
pa_array = pa.array(value, from_pandas=True, type=pa_type)
except (ValueError, TypeError) as e:
raise TypeError(
f"New values must be convertible to the existing element pyarrow type, {pa_type}. "
"If you want to replace field with values of a new type, use series.nest.with_flat_field() "
"or NestedExtensionArray.set_flat_field(..., keep_dtype=False) instead."
) from e

if len(pa_array) != self.flat_length:
raise ValueError("The input must be a struct_scalar or have the same length as the flat arrays")

list_array = pa.ListArray.from_arrays(values=pa_array, offsets=self.list_offsets)

return self.set_list_field(field, list_array)
return self.set_list_field(field, list_array, keep_dtype=keep_dtype)

def set_list_field(self, field: str, value: ArrayLike) -> None:
def set_list_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = False) -> None:
"""Set the field from list-array
Note that if this updates the dtype, it does not affect the dtype of
the pd.Series backed by this extension array.
Parameters
----------
field : str
The name of the field.
value : ArrayLike
The list-array of values to be set.
keep_dtype : bool, default False
Whether to keep the original dtype of the field. If True,
no new field will be created, and the dtype of the existing
field will be kept. If False, the dtype of the field will be
inferred from the input value.
"""
# TODO: optimize for the case when the input is a pa.ChunkedArray

pa_array = pa.array(value)
if keep_dtype:
if field not in self.field_names:
raise ValueError(
"If keep_dtype is True, the field must exist in the series. "
f"Got: {field}, available: {self.field_names}"
)
pa_type = self._pa_array.chunk(0).field(field).type
else:
pa_type = None

try:
pa_array = pa.array(value, from_pandas=True, type=pa_type)
except (ValueError, TypeError) as e:
raise TypeError(
f"New values must be convertible to the existing list pyarrow type, {pa_type}. "
"If you want to replace field with values of a new type, use series.nest.with_list_field() "
"or NestedExtensionArray.set_list_field(..., keep_dtype=False) instead."
) from e

if not is_pa_type_a_list(pa_array.type):
raise ValueError(f"Expected a list array, got {pa_array.type}")
Expand All @@ -724,38 +775,42 @@ def set_list_field(self, field: str, value: ArrayLike) -> None:
for sl, chunk in enumerate_chunks(self._pa_array):
chunk = cast(pa.StructArray, chunk)

# Build a new struct array. We collect all existing fields and add the new one.
# Build a new struct array. We collect all existing fields and add/replace the new one.
struct_dict = {}
for pa_field in chunk.type:
struct_dict[pa_field.name] = chunk.field(pa_field.name)
struct_dict[field] = pa.array(pa_array[sl])
struct_dict[field] = pa_array[sl]

struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys())
chunks.append(struct_array)
pa_array = pa.chunked_array(chunks)
chunked_array = pa.chunked_array(chunks)

self._replace_pa_array(chunked_array, validate=True)

self._replace_pa_array(pa_array, validate=True)
def pop_fields(self, fields: Iterable[str]):
"""Delete fields from the struct array
def pop_field(self, field: str):
"""Delete a field from the struct array
Note that at least one field must be left in the struct array.
Parameters
----------
field : str
The name of the field to be deleted.
fields : iterable of str
The names of the fields to delete.
"""
if field not in self.field_names:
raise ValueError(f"Field '{field}' not found")
fields = frozenset(fields)

if not fields.issubset(self.field_names):
raise ValueError(f"Some fields are not found, given: {fields}, available: {self.field_names}")

if len(self.field_names) == 1:
raise ValueError("Cannot delete the last field")
if len(self.field_names) - len(fields) == 0:
raise ValueError("Cannot delete all fields")

chunks = []
for chunk in self._pa_array.iterchunks():
chunk = cast(pa.StructArray, chunk)
struct_dict = {}
for pa_field in chunk.type:
if pa_field.name != field:
if pa_field.name not in fields:
struct_dict[pa_field.name] = chunk.field(pa_field.name)
struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys())
chunks.append(struct_array)
Expand Down
Loading

0 comments on commit d86f37e

Please sign in to comment.