Fix low_level.ipynb

lincc-frameworks · May 29, 2024 · 5d9979f · 5d9979f
1 parent d86f37e
commit 5d9979f
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 155 deletions.
diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb
@@ -17,12 +17,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "619f088e7ac0f327",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.641800Z",
-     "start_time": "2024-05-09T12:43:47.634903Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -49,12 +44,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "f9dd16a4bb9aaa63",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.708715Z",
-     "start_time": "2024-05-09T12:43:47.700005Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_df = generate_data(4, 3, seed=42)\n",
@@ -83,20 +73,17 @@
    "source": [
     "### `.nest` object is a mapping\n",
     "\n",
-    "`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like a dictionary.\n",
-    "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data."
+    "The `.nest` accessor provides an object implementing the `Mapping` interface, so you can use it like an immutable dictionary.\n",
+    "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data.\n",
+    "\n",
+    "The only way to modify the nested data in-place with this interface is to re-assign the whole field with new data of the same length and dtype; see the discussion about the mutability limitations in [this GitHub issue](https://github.com/lincc-frameworks/nested-pandas/issues/87)."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "fb7beb750d3e2893",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.711893Z",
-     "start_time": "2024-05-09T12:43:47.709614Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "list(nested_series.nest.keys())"
@@ -114,12 +101,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "56b0d9ffc5820d22",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.714235Z",
-     "start_time": "2024-05-09T12:43:47.712499Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_series.nest.fields"
@@ -137,12 +119,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "30ee9a430b6ff641",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.717863Z",
-     "start_time": "2024-05-09T12:43:47.715368Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_series.nest[\"t\"]"
@@ -160,12 +137,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "f0db15d31b289140",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.720405Z",
-     "start_time": "2024-05-09T12:43:47.718626Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_series.nest[[\"t\", \"flux\"]].dtype"
@@ -177,35 +149,38 @@
    "metadata": {},
    "source": [
     "You can add new columns, drop existing ones, or modify the existing ones.\n",
-    "The modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length.\n",
+    "These operations create a new nested Series; however, they make only shallow copies of the rest of the fields, so they are quite efficient.\n",
+    "\n",
+    "In-place modification is currently limited to replacing the whole \"flat\" Series with a new one of the same length and a compatible dtype.\n",
     "When modifying the nested data, only the column you are working with is changed, the rest of the data are not affected and not copied."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "66ae5cc26fa17458",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.726619Z",
-     "start_time": "2024-05-09T12:43:47.721070Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "new_series = nested_series.copy()\n",
     "\n",
     "# Change the data in-place\n",
     "new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n",
     "\n",
-    "# Add new column\n",
-    "new_series.nest[\"lsst_band\"] = \"lsst_\" + new_series.nest[\"band\"]\n",
+    "# Create a new series with a new column\n",
+    "new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n",
     "\n",
-    "# Drop the column, .pop() method is also available\n",
-    "del new_series.nest[\"band\"]\n",
+    "# Create a new series with a column removed, you can also pass a list of columns to remove\n",
+    "new_series = new_series.nest.without_field(\"band\")\n",
     "\n",
     "# Add a new column with a python list instead of a Series\n",
-    "new_series.nest[\"new_column\"] = [1, 2] * (new_series.nest.flat_length // 2)\n",
+    "new_series = new_series.nest.with_field(\n",
+    "    \"new_column\",\n",
+    "    [1, 2] * (new_series.nest.flat_length // 2),\n",
+    ")\n",
+    "\n",
+    "# Create a new series, with a column dtype changed\n",
+    "new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n",
     "\n",
     "new_series.nest.to_flat()"
    ]
@@ -228,12 +203,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "ce6d519d8d37ead3",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.768616Z",
-     "start_time": "2024-05-09T12:43:47.764343Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_series.nest.to_flat([\"flux\", \"t\"])"
@@ -243,12 +213,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "2421b91387487995",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.798697Z",
-     "start_time": "2024-05-09T12:43:47.795583Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "lists_df = nested_series.nest.to_lists()  # may also accept a list of fields (nested columns) to get\n",
@@ -267,19 +232,12 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "f2c205e95affb9ba",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.833034Z",
-     "start_time": "2024-05-09T12:43:47.827805Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "new_series = nested_series.copy()\n",
-    "\n",
     "# Adjust each time to be relative to the first observation\n",
     "dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n",
-    "new_series.nest.set_list_field(\"dt\", dt)\n",
+    "new_series = new_series.nest.with_list_field(\"dt\", dt)\n",
     "new_series.nest.to_flat()"
    ]
   },
@@ -313,12 +271,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "8ef96243c6d74aff",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.875752Z",
-     "start_time": "2024-05-09T12:43:47.872293Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "struct_series = pd.Series(nested_series, dtype=nested_series.dtype.to_pandas_arrow_dtype())\n",
@@ -329,12 +282,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "422e719861ae40f6",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.925465Z",
-     "start_time": "2024-05-09T12:43:47.922965Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_series.equals(pd.Series(struct_series, dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))"
@@ -364,12 +312,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "926f2c9fcffc5f03",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.937490Z",
-     "start_time": "2024-05-09T12:43:47.933878Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "new_series = pack(nested_series.nest.to_flat())\n",
@@ -380,12 +323,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "3a1d2025c232ac82",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.969831Z",
-     "start_time": "2024-05-09T12:43:47.964948Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "series_from_flat = pack(\n",
@@ -422,12 +360,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "2de4619726ab3d5c",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.991261Z",
-     "start_time": "2024-05-09T12:43:47.986129Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "series_from_pack = pack(\n",
@@ -454,12 +387,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "9c63ae45dd0b6a29",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:47.995869Z",
-     "start_time": "2024-05-09T12:43:47.992016Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "series_from_pack = pack(\n",
@@ -500,12 +428,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "1284d9b536b9e784",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.000441Z",
-     "start_time": "2024-05-09T12:43:47.996620Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "series_from_dtype = pd.Series(\n",
@@ -531,12 +454,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "b7c7fd878bc97f68",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.004677Z",
-     "start_time": "2024-05-09T12:43:48.001129Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "series_pa_type = pa.struct({\"t\": pa.list_(pa.float64()), \"band\": pa.list_(pa.string())})\n",
@@ -568,12 +486,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "e837d25dcb0a2b4d",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.015257Z",
-     "start_time": "2024-05-09T12:43:48.013217Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "pa_struct_array = pa.StructArray.from_arrays(\n",
@@ -611,12 +524,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "116c902ea8681c9e",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.040801Z",
-     "start_time": "2024-05-09T12:43:48.038106Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Convert to pd.ArrowDtype Series of struct-arrays\n",
@@ -641,12 +549,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "30ea40dee30795d1",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.055678Z",
-     "start_time": "2024-05-09T12:43:48.050677Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "for element in nested_series:\n",
@@ -665,12 +568,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "81f6c1f98dfc26a9",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.060166Z",
-     "start_time": "2024-05-09T12:43:48.056425Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_elements = list(nested_series)\n",
@@ -689,12 +587,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "69ed758c48c55015",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.063115Z",
-     "start_time": "2024-05-09T12:43:48.060863Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "nested_series_with_na = pack([None, pd.NA, {\"t\": [1, 2], \"flux\": [0.1, None]}])\n",
@@ -707,12 +600,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "99ce9d18bc69ae49",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-05-09T12:43:48.088986Z",
-     "start_time": "2024-05-09T12:43:48.086255Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Would have empty pd.DataFrame for top-level missed data\n",