Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for accessor mapping methods #91

Merged
merged 3 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 43 additions & 155 deletions docs/tutorials/low_level.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,7 @@
"cell_type": "code",
"execution_count": null,
"id": "619f088e7ac0f327",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.641800Z",
"start_time": "2024-05-09T12:43:47.634903Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
Expand All @@ -49,12 +44,7 @@
"cell_type": "code",
"execution_count": null,
"id": "f9dd16a4bb9aaa63",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.708715Z",
"start_time": "2024-05-09T12:43:47.700005Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_df = generate_data(4, 3, seed=42)\n",
Expand Down Expand Up @@ -83,20 +73,17 @@
"source": [
"### `.nest` object is a mapping\n",
"\n",
"`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like a dictionary.\n",
"Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data."
"`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like an immutable dictionary.\n",
"Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data.\n",
"\n",
"The only way to modify the nested data in-place with this interface is to re-assign the whole field with a new data of the same length and dtype, see the discussion about the mutability limitations in [this GitHub issue](https://github.com/lincc-frameworks/nested-pandas/issues/87)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb7beb750d3e2893",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.711893Z",
"start_time": "2024-05-09T12:43:47.709614Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"list(nested_series.nest.keys())"
Expand All @@ -114,12 +101,7 @@
"cell_type": "code",
"execution_count": null,
"id": "56b0d9ffc5820d22",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.714235Z",
"start_time": "2024-05-09T12:43:47.712499Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest.fields"
Expand All @@ -137,12 +119,7 @@
"cell_type": "code",
"execution_count": null,
"id": "30ee9a430b6ff641",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.717863Z",
"start_time": "2024-05-09T12:43:47.715368Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest[\"t\"]"
Expand All @@ -160,12 +137,7 @@
"cell_type": "code",
"execution_count": null,
"id": "f0db15d31b289140",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.720405Z",
"start_time": "2024-05-09T12:43:47.718626Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest[[\"t\", \"flux\"]].dtype"
Expand All @@ -177,35 +149,38 @@
"metadata": {},
"source": [
"You can add new columns, drop existing ones, or modify the existing ones.\n",
"The modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length.\n",
"These operations would create new nested Series, however they would create shallow copies of the rest of the fields, so they are quite efficient.\n",
"\n",
"The in-place modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length and compatible dtype.\n",
"When modifying the nested data, only the column you are working with is changed, the rest of the data are not affected and not copied."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66ae5cc26fa17458",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.726619Z",
"start_time": "2024-05-09T12:43:47.721070Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"new_series = nested_series.copy()\n",
"\n",
"# Change the data in-place\n",
"new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n",
"\n",
"# Add new column\n",
"new_series.nest[\"lsst_band\"] = \"lsst_\" + new_series.nest[\"band\"]\n",
"# Create a new series with a new column\n",
"new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n",
"\n",
"# Drop the column, .pop() method is also available\n",
"del new_series.nest[\"band\"]\n",
"# Create a new series with a column removed, you can also pass a list of columns to remove\n",
"new_series = new_series.nest.without_field(\"band\")\n",
"\n",
"# Add a new column with a python list instead of a Series\n",
"new_series.nest[\"new_column\"] = [1, 2] * (new_series.nest.flat_length // 2)\n",
"new_series = new_series.nest.with_field(\n",
" \"new_column\",\n",
" [1, 2] * (new_series.nest.flat_length // 2),\n",
")\n",
"\n",
"# Create a new series, with a column dtype changed\n",
"new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n",
"\n",
"new_series.nest.to_flat()"
]
Expand All @@ -228,12 +203,7 @@
"cell_type": "code",
"execution_count": null,
"id": "ce6d519d8d37ead3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.768616Z",
"start_time": "2024-05-09T12:43:47.764343Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest.to_flat([\"flux\", \"t\"])"
Expand All @@ -243,12 +213,7 @@
"cell_type": "code",
"execution_count": null,
"id": "2421b91387487995",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.798697Z",
"start_time": "2024-05-09T12:43:47.795583Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"lists_df = nested_series.nest.to_lists() # may also accept a list of fields (nested columns) to get\n",
Expand All @@ -267,19 +232,12 @@
"cell_type": "code",
"execution_count": null,
"id": "f2c205e95affb9ba",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.833034Z",
"start_time": "2024-05-09T12:43:47.827805Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"new_series = nested_series.copy()\n",
"\n",
"# Adjust each time to be relative to the first observation\n",
"dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n",
"new_series.nest.set_list_field(\"dt\", dt)\n",
"new_series = new_series.nest.with_list_field(\"dt\", dt)\n",
"new_series.nest.to_flat()"
]
},
Expand Down Expand Up @@ -313,12 +271,7 @@
"cell_type": "code",
"execution_count": null,
"id": "8ef96243c6d74aff",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.875752Z",
"start_time": "2024-05-09T12:43:47.872293Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"struct_series = pd.Series(nested_series, dtype=nested_series.dtype.to_pandas_arrow_dtype())\n",
Expand All @@ -329,12 +282,7 @@
"cell_type": "code",
"execution_count": null,
"id": "422e719861ae40f6",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.925465Z",
"start_time": "2024-05-09T12:43:47.922965Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.equals(pd.Series(struct_series, dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))"
Expand Down Expand Up @@ -364,12 +312,7 @@
"cell_type": "code",
"execution_count": null,
"id": "926f2c9fcffc5f03",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.937490Z",
"start_time": "2024-05-09T12:43:47.933878Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"new_series = pack(nested_series.nest.to_flat())\n",
Expand All @@ -380,12 +323,7 @@
"cell_type": "code",
"execution_count": null,
"id": "3a1d2025c232ac82",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.969831Z",
"start_time": "2024-05-09T12:43:47.964948Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_flat = pack(\n",
Expand Down Expand Up @@ -422,12 +360,7 @@
"cell_type": "code",
"execution_count": null,
"id": "2de4619726ab3d5c",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.991261Z",
"start_time": "2024-05-09T12:43:47.986129Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_pack = pack(\n",
Expand All @@ -454,12 +387,7 @@
"cell_type": "code",
"execution_count": null,
"id": "9c63ae45dd0b6a29",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.995869Z",
"start_time": "2024-05-09T12:43:47.992016Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_pack = pack(\n",
Expand Down Expand Up @@ -500,12 +428,7 @@
"cell_type": "code",
"execution_count": null,
"id": "1284d9b536b9e784",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.000441Z",
"start_time": "2024-05-09T12:43:47.996620Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_dtype = pd.Series(\n",
Expand All @@ -531,12 +454,7 @@
"cell_type": "code",
"execution_count": null,
"id": "b7c7fd878bc97f68",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.004677Z",
"start_time": "2024-05-09T12:43:48.001129Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_pa_type = pa.struct({\"t\": pa.list_(pa.float64()), \"band\": pa.list_(pa.string())})\n",
Expand Down Expand Up @@ -568,12 +486,7 @@
"cell_type": "code",
"execution_count": null,
"id": "e837d25dcb0a2b4d",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.015257Z",
"start_time": "2024-05-09T12:43:48.013217Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"pa_struct_array = pa.StructArray.from_arrays(\n",
Expand Down Expand Up @@ -611,12 +524,7 @@
"cell_type": "code",
"execution_count": null,
"id": "116c902ea8681c9e",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.040801Z",
"start_time": "2024-05-09T12:43:48.038106Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"# Convert to pd.ArrowDtype Series of struct-arrays\n",
Expand All @@ -641,12 +549,7 @@
"cell_type": "code",
"execution_count": null,
"id": "30ea40dee30795d1",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.055678Z",
"start_time": "2024-05-09T12:43:48.050677Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"for element in nested_series:\n",
Expand All @@ -665,12 +568,7 @@
"cell_type": "code",
"execution_count": null,
"id": "81f6c1f98dfc26a9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.060166Z",
"start_time": "2024-05-09T12:43:48.056425Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_elements = list(nested_series)\n",
Expand All @@ -689,12 +587,7 @@
"cell_type": "code",
"execution_count": null,
"id": "69ed758c48c55015",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.063115Z",
"start_time": "2024-05-09T12:43:48.060863Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series_with_na = pack([None, pd.NA, {\"t\": [1, 2], \"flux\": [0.1, None]}])\n",
Expand All @@ -707,12 +600,7 @@
"cell_type": "code",
"execution_count": null,
"id": "99ce9d18bc69ae49",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.088986Z",
"start_time": "2024-05-09T12:43:48.086255Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"# Would have empty pd.DataFrame for top-level missed data\n",
Expand Down
Loading