From 5d9979fca7075b5eee78c340ac869c0b51d79cb4 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 29 May 2024 10:32:00 -0400 Subject: [PATCH] Fix low_level.ipynb --- docs/tutorials/low_level.ipynb | 198 ++++++--------------------- src/nested_pandas/series/accessor.py | 20 +++ 2 files changed, 63 insertions(+), 155 deletions(-) diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb index 7672662..307366c 100644 --- a/docs/tutorials/low_level.ipynb +++ b/docs/tutorials/low_level.ipynb @@ -17,12 +17,7 @@ "cell_type": "code", "execution_count": null, "id": "619f088e7ac0f327", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.641800Z", - "start_time": "2024-05-09T12:43:47.634903Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -49,12 +44,7 @@ "cell_type": "code", "execution_count": null, "id": "f9dd16a4bb9aaa63", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.708715Z", - "start_time": "2024-05-09T12:43:47.700005Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_df = generate_data(4, 3, seed=42)\n", @@ -83,20 +73,17 @@ "source": [ "### `.nest` object is a mapping\n", "\n", - "`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like a dictionary.\n", - "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data." 
+ "`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like an immutable dictionary.\n", + "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data.\n", + "\n", + "The only way to modify the nested data in-place with this interface is to re-assign the whole field with new data of the same length and dtype; see the discussion of the mutability limitations in [this GitHub issue](https://github.com/lincc-frameworks/nested-pandas/issues/87)." ] }, { "cell_type": "code", "execution_count": null, "id": "fb7beb750d3e2893", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.711893Z", - "start_time": "2024-05-09T12:43:47.709614Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "list(nested_series.nest.keys())" @@ -114,12 +101,7 @@ "cell_type": "code", "execution_count": null, "id": "56b0d9ffc5820d22", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.714235Z", - "start_time": "2024-05-09T12:43:47.712499Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest.fields" @@ -137,12 +119,7 @@ "cell_type": "code", "execution_count": null, "id": "30ee9a430b6ff641", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.717863Z", - "start_time": "2024-05-09T12:43:47.715368Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest[\"t\"]" @@ -160,12 +137,7 @@ "cell_type": "code", "execution_count": null, "id": "f0db15d31b289140", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.720405Z", - "start_time": "2024-05-09T12:43:47.718626Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest[[\"t\", \"flux\"]].dtype" @@ -177,7 +149,9 @@ "metadata": {}, "source": [ "You can add new columns, drop existing ones, or modify the existing ones.\n", - "The modification is currently limited to the case when you replace the whole \"flat\" Series with a 
new one of the same length.\n", + "These operations create a new nested Series; however, they make only shallow copies of the remaining fields, so they are quite efficient.\n", + "\n", + "The in-place modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length and compatible dtype.\n", "When modifying the nested data, only the column you are working with is changed, the rest of the data are not affected and not copied." ] }, @@ -185,12 +159,7 @@ "cell_type": "code", "execution_count": null, "id": "66ae5cc26fa17458", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.726619Z", - "start_time": "2024-05-09T12:43:47.721070Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "new_series = nested_series.copy()\n", @@ -198,14 +167,20 @@ "# Change the data in-place\n", "new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n", "\n", - "# Add new column\n", - "new_series.nest[\"lsst_band\"] = \"lsst_\" + new_series.nest[\"band\"]\n", + "# Create a new series with a new column\n", + "new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n", "\n", - "# Drop the column, .pop() method is also available\n", - "del new_series.nest[\"band\"]\n", + "# Create a new series with a column removed, you can also pass a list of columns to remove\n", + "new_series = new_series.nest.without_field(\"band\")\n", "\n", "# Add a new column with a python list instead of a Series\n", - "new_series.nest[\"new_column\"] = [1, 2] * (new_series.nest.flat_length // 2)\n", + "new_series = new_series.nest.with_field(\n", + " \"new_column\",\n", + " [1, 2] * (new_series.nest.flat_length // 2),\n", + ")\n", + "\n", + "# Create a new series, with a column dtype changed\n", + "new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n", "\n", "new_series.nest.to_flat()" ] @@ -228,12 +203,7 @@ "cell_type": 
"code", "execution_count": null, "id": "ce6d519d8d37ead3", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.768616Z", - "start_time": "2024-05-09T12:43:47.764343Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest.to_flat([\"flux\", \"t\"])" @@ -243,12 +213,7 @@ "cell_type": "code", "execution_count": null, "id": "2421b91387487995", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.798697Z", - "start_time": "2024-05-09T12:43:47.795583Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "lists_df = nested_series.nest.to_lists() # may also accept a list of fields (nested columns) to get\n", @@ -267,19 +232,12 @@ "cell_type": "code", "execution_count": null, "id": "f2c205e95affb9ba", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.833034Z", - "start_time": "2024-05-09T12:43:47.827805Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "new_series = nested_series.copy()\n", - "\n", "# Adjust each time to be relative to the first observation\n", "dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n", - "new_series.nest.set_list_field(\"dt\", dt)\n", + "new_series = new_series.nest.with_list_field(\"dt\", dt)\n", "new_series.nest.to_flat()" ] }, @@ -313,12 +271,7 @@ "cell_type": "code", "execution_count": null, "id": "8ef96243c6d74aff", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.875752Z", - "start_time": "2024-05-09T12:43:47.872293Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "struct_series = pd.Series(nested_series, dtype=nested_series.dtype.to_pandas_arrow_dtype())\n", @@ -329,12 +282,7 @@ "cell_type": "code", "execution_count": null, "id": "422e719861ae40f6", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.925465Z", - "start_time": "2024-05-09T12:43:47.922965Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.equals(pd.Series(struct_series, 
dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))" @@ -364,12 +312,7 @@ "cell_type": "code", "execution_count": null, "id": "926f2c9fcffc5f03", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.937490Z", - "start_time": "2024-05-09T12:43:47.933878Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "new_series = pack(nested_series.nest.to_flat())\n", @@ -380,12 +323,7 @@ "cell_type": "code", "execution_count": null, "id": "3a1d2025c232ac82", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.969831Z", - "start_time": "2024-05-09T12:43:47.964948Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_flat = pack(\n", @@ -422,12 +360,7 @@ "cell_type": "code", "execution_count": null, "id": "2de4619726ab3d5c", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.991261Z", - "start_time": "2024-05-09T12:43:47.986129Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_pack = pack(\n", @@ -454,12 +387,7 @@ "cell_type": "code", "execution_count": null, "id": "9c63ae45dd0b6a29", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.995869Z", - "start_time": "2024-05-09T12:43:47.992016Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_pack = pack(\n", @@ -500,12 +428,7 @@ "cell_type": "code", "execution_count": null, "id": "1284d9b536b9e784", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.000441Z", - "start_time": "2024-05-09T12:43:47.996620Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_dtype = pd.Series(\n", @@ -531,12 +454,7 @@ "cell_type": "code", "execution_count": null, "id": "b7c7fd878bc97f68", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.004677Z", - "start_time": "2024-05-09T12:43:48.001129Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_pa_type = pa.struct({\"t\": pa.list_(pa.float64()), \"band\": pa.list_(pa.string())})\n", @@ -568,12 +486,7 
@@ "cell_type": "code", "execution_count": null, "id": "e837d25dcb0a2b4d", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.015257Z", - "start_time": "2024-05-09T12:43:48.013217Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "pa_struct_array = pa.StructArray.from_arrays(\n", @@ -611,12 +524,7 @@ "cell_type": "code", "execution_count": null, "id": "116c902ea8681c9e", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.040801Z", - "start_time": "2024-05-09T12:43:48.038106Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Convert to pd.ArrowDtype Series of struct-arrays\n", @@ -641,12 +549,7 @@ "cell_type": "code", "execution_count": null, "id": "30ea40dee30795d1", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.055678Z", - "start_time": "2024-05-09T12:43:48.050677Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "for element in nested_series:\n", @@ -665,12 +568,7 @@ "cell_type": "code", "execution_count": null, "id": "81f6c1f98dfc26a9", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.060166Z", - "start_time": "2024-05-09T12:43:48.056425Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_elements = list(nested_series)\n", @@ -689,12 +587,7 @@ "cell_type": "code", "execution_count": null, "id": "69ed758c48c55015", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.063115Z", - "start_time": "2024-05-09T12:43:48.060863Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series_with_na = pack([None, pd.NA, {\"t\": [1, 2], \"flux\": [0.1, None]}])\n", @@ -707,12 +600,7 @@ "cell_type": "code", "execution_count": null, "id": "99ce9d18bc69ae49", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.088986Z", - "start_time": "2024-05-09T12:43:48.086255Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Would have empty pd.DataFrame for top-level missed data\n", diff --git 
a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 6ff1912..4cd67fd 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -124,6 +124,26 @@ def fields(self) -> list[str]: """Names of the nested columns""" return self._series.array.field_names + def with_field(self, field: str, value: ArrayLike) -> pd.Series: + """Set the field from flat-array of values and return a new series + + It is an alias for `.nest.with_flat_field`. + + Parameters + ---------- + field : str + Name of the field to set. If not present, it will be added. + value : ArrayLike + Array of values to set. It must be a scalar or have the same length + as the flat arrays, i.e. `self.flat_length`. + + Returns + ------- + pd.Series + The new series with the field set. + """ + return self.with_flat_field(field, value) + def with_flat_field(self, field: str, value: ArrayLike) -> pd.Series: """Set the field from flat-array of values and return a new series