Accessor(MutableMapping) -> Mapping

lincc-frameworks · May 29, 2024 · d86f37e · d86f37e
1 parent 41cce93
commit d86f37e
Show file tree

Hide file tree

Showing 4 changed files with 322 additions and 87 deletions.
diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 from collections import defaultdict
-from collections.abc import Generator, MutableMapping
+from collections.abc import Generator, Mapping
 from typing import cast
 
 import numpy as np
@@ -18,7 +18,7 @@
 
 
 @register_series_accessor("nest")
-class NestSeriesAccessor(MutableMapping):
+class NestSeriesAccessor(Mapping):
     """Accessor for operations on Series of NestedDtype
 
     This accessor implements `MutableMapping` interface over the fields of the
@@ -124,8 +124,8 @@ def fields(self) -> list[str]:
         """Names of the nested columns"""
         return self._series.array.field_names
 
-    def set_flat_field(self, field: str, value: ArrayLike) -> None:
-        """Set the field from flat-array of values, in-place
+    def with_flat_field(self, field: str, value: ArrayLike) -> pd.Series:
+        """Set the field from flat-array of values and return a new series
 
         Parameters
         ----------
@@ -134,11 +134,18 @@ def set_flat_field(self, field: str, value: ArrayLike) -> None:
         value : ArrayLike
             Array of values to set. It must be a scalar or have the same length
              as the flat arrays, e.g. `self.flat_length`.
+
+        Returns
+        -------
+        pd.Series
+            The new series with the field set.
         """
-        self._series.array.set_flat_field(field, value)
+        new_array = self._series.array.copy()
+        new_array.set_flat_field(field, value)
+        return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)
 
-    def set_list_field(self, field: str, value: ArrayLike) -> None:
-        """Set the field from list-array, in-place
+    def with_list_field(self, field: str, value: ArrayLike) -> pd.Series:
+        """Set the field from list-array of values and return a new series
 
         Parameters
         ----------
@@ -147,27 +154,37 @@ def set_list_field(self, field: str, value: ArrayLike) -> None:
         value : ArrayLike
             Array of values to set. It must be a list-array of the same length
              as the series, e.g. length of the series.
+
+        Returns
+        -------
+        pd.Series
+            The new series with the field set.
         """
-        self._series.array.set_list_field(field, value)
+        new_array = self._series.array.copy()
+        new_array.set_list_field(field, value)
+        return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)
+
+    def without_field(self, field: str | list[str]) -> pd.Series:
+        """Remove the field(s) from the series and return a new series
 
-    # I intentionally don't call it `drop` or `drop_field` because `pd.DataFrame.drop` is not inplace
-    # by default, and I wouldn't like to surprise the user.
-    def pop_field(self, field: str) -> pd.Series:
-        """Delete the field from the struct and return it.
+        Note that at least one field must be left in the series.
 
         Parameters
         ----------
-        field : str
-            Name of the field to delete.
+        field : str or list[str]
+            Name of the field(s) to remove.
 
         Returns
         -------
         pd.Series
-            The deleted field.
+            The new series without the field(s).
         """
-        series = self[field]
-        self._series.array.pop_field(field)
-        return series
+        if isinstance(field, str):
+            field = [field]
+
+        new_array = self._series.array.copy()
+        new_array.pop_fields(field)
+        return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)
 
     def query_flat(self, query: str) -> pd.Series:
         """Query the flat arrays with a boolean expression
@@ -255,6 +272,12 @@ def __getitem__(self, key: str | list[str]) -> pd.Series:
         return self.get_flat_series(key)
 
     def __setitem__(self, key: str, value: ArrayLike) -> None:
+        """Replace the field values from flat-array of values
+
+        Currently, only replacement of the whole field is supported, the length
+        and dtype of the input value must match the field.
+        https://github.com/lincc-frameworks/nested-pandas/issues/87
+        """
         # TODO: we can be much-much smarter about the performance here
         # TODO: think better about underlying pa.ChunkArray in both self._series.array and value
 
@@ -268,7 +291,7 @@ def __setitem__(self, key: str, value: ArrayLike) -> None:
 
         # Set single value for all rows
         if ndim == 0:
-            self.set_flat_field(key, value)
+            self._series.array.set_flat_field(key, value, keep_dtype=True)
             return
 
         if isinstance(value, pd.Series) and not self.get_flat_index().equals(value.index):
@@ -284,13 +307,22 @@ def __setitem__(self, key: str, value: ArrayLike) -> None:
                 f"{len(self._series)}."
             )
 
-        self.set_flat_field(key, pa_array)
-
-    def __delitem__(self, key: str) -> None:
-        self.pop_field(key)
+        self._series.array.set_flat_field(key, pa_array, keep_dtype=True)
 
     def __iter__(self) -> Generator[str, None, None]:
-        yield from iter(self._series.array.field_names)
+        return iter(self._series.array.field_names)
 
     def __len__(self) -> int:
         return len(self._series.array.field_names)
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, type(self)):
+            return False
+        return self._series.equals(other._series)
+
+    def clear(self) -> None:
+        """`MutableMapping`-style method, always fails with NotImplementedError
+
+        The reason is that we cannot delete all nested fields from the nested series.
+        """
+        raise NotImplementedError("Cannot delete fields from nested series")
diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -35,7 +35,7 @@
 # typing.Self and "|" union syntax don't exist in Python 3.9
 from __future__ import annotations
 
-from collections.abc import Iterator, Sequence
+from collections.abc import Iterable, Iterator, Sequence
 from typing import Any, Callable, cast
 
 import numpy as np
@@ -676,43 +676,94 @@ def view_fields(self, fields: str | list[str]) -> Self:  # type: ignore[name-def
 
         return self.__class__(pa_array, validate=False)
 
-    def set_flat_field(self, field: str, value: ArrayLike) -> None:
+    def set_flat_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = False) -> None:
         """Set the field from flat-array of values
 
+        Note that if this updates the dtype, it does not affect the dtype of
+        the pd.Series backed by this extension array.
+
         Parameters
         ----------
         field : str
             The name of the field.
         value : ArrayLike
             The 'flat' array of values to be set.
+        keep_dtype : bool, default False
+            Whether to keep the original dtype of the field. If True,
+            no new field will be created, and the dtype of the existing
+            field will be kept. If False, the dtype of the field will be
+            inferred from the input value.
         """
         # TODO: optimize for the case when the input is a pa.ChunkedArray
 
+        if keep_dtype:
+            if field not in self.field_names:
+                raise ValueError(
+                    "If keep_dtype is True, the field must exist in the series. "
+                    f"Got: {field}, available: {self.field_names}"
+                )
+            # Get the current element type of list-array
+            pa_type = self._pa_array.chunk(0).field(field).type.value_type
+        else:
+            pa_type = None
+
         if np.ndim(value) == 0:
             value = np.repeat(value, self.flat_length)
 
-        pa_array = pa.array(value)
+        try:
+            pa_array = pa.array(value, from_pandas=True, type=pa_type)
+        except (ValueError, TypeError) as e:
+            raise TypeError(
+                f"New values must be convertible to the existing element pyarrow type, {pa_type}. "
+                "If you want to replace field with values of a new type, use series.nest.with_flat_field() "
+                "or NestedExtensionArray.set_flat_field(..., keep_dtype=False) instead."
+            ) from e
 
         if len(pa_array) != self.flat_length:
             raise ValueError("The input must be a struct_scalar or have the same length as the flat arrays")
 
         list_array = pa.ListArray.from_arrays(values=pa_array, offsets=self.list_offsets)
 
-        return self.set_list_field(field, list_array)
+        return self.set_list_field(field, list_array, keep_dtype=keep_dtype)
 
-    def set_list_field(self, field: str, value: ArrayLike) -> None:
+    def set_list_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = False) -> None:
         """Set the field from list-array
 
+        Note that if this updates the dtype, it does not affect the dtype of
+        the pd.Series backed by this extension array.
+
         Parameters
         ----------
         field : str
             The name of the field.
         value : ArrayLike
             The list-array of values to be set.
+        keep_dtype : bool, default False
+            Whether to keep the original dtype of the field. If True,
+            no new field will be created, and the dtype of the existing
+            field will be kept. If False, the dtype of the field will be
+            inferred from the input value.
         """
         # TODO: optimize for the case when the input is a pa.ChunkedArray
 
-        pa_array = pa.array(value)
+        if keep_dtype:
+            if field not in self.field_names:
+                raise ValueError(
+                    "If keep_dtype is True, the field must exist in the series. "
+                    f"Got: {field}, available: {self.field_names}"
+                )
+            pa_type = self._pa_array.chunk(0).field(field).type
+        else:
+            pa_type = None
+
+        try:
+            pa_array = pa.array(value, from_pandas=True, type=pa_type)
+        except (ValueError, TypeError) as e:
+            raise TypeError(
+                f"New values must be convertible to the existing list pyarrow type, {pa_type}. "
+                "If you want to replace field with values of a new type, use series.nest.with_list_field() "
+                "or NestedExtensionArray.set_list_field(..., keep_dtype=False) instead."
+            ) from e
 
         if not is_pa_type_a_list(pa_array.type):
             raise ValueError(f"Expected a list array, got {pa_array.type}")
@@ -724,38 +775,42 @@ def set_list_field(self, field: str, value: ArrayLike) -> None:
         for sl, chunk in enumerate_chunks(self._pa_array):
             chunk = cast(pa.StructArray, chunk)
 
-            # Build a new struct array. We collect all existing fields and add the new one.
+            # Build a new struct array. We collect all existing fields and add/replace the new one.
             struct_dict = {}
             for pa_field in chunk.type:
                 struct_dict[pa_field.name] = chunk.field(pa_field.name)
-            struct_dict[field] = pa.array(pa_array[sl])
+            struct_dict[field] = pa_array[sl]
 
             struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys())
             chunks.append(struct_array)
-        pa_array = pa.chunked_array(chunks)
+        chunked_array = pa.chunked_array(chunks)
+
+        self._replace_pa_array(chunked_array, validate=True)
 
-        self._replace_pa_array(pa_array, validate=True)
+    def pop_fields(self, fields: Iterable[str]):
+        """Delete fields from the struct array
 
-    def pop_field(self, field: str):
-        """Delete a field from the struct array
+        Note that at least one field must be left in the struct array.
 
         Parameters
         ----------
-        field : str
-            The name of the field to be deleted.
+        fields : iterable of str
+            The names of the fields to delete.
         """
-        if field not in self.field_names:
-            raise ValueError(f"Field '{field}' not found")
+        fields = frozenset(fields)
+
+        if not fields.issubset(self.field_names):
+            raise ValueError(f"Some fields are not found, given: {fields}, available: {self.field_names}")
 
-        if len(self.field_names) == 1:
-            raise ValueError("Cannot delete the last field")
+        if len(self.field_names) - len(fields) == 0:
+            raise ValueError("Cannot delete all fields")
 
         chunks = []
         for chunk in self._pa_array.iterchunks():
             chunk = cast(pa.StructArray, chunk)
             struct_dict = {}
             for pa_field in chunk.type:
-                if pa_field.name != field:
+                if pa_field.name not in fields:
                     struct_dict[pa_field.name] = chunk.field(pa_field.name)
             struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys())
             chunks.append(struct_array)