From 78796eb90dff217c0af582fc41a83987cce78266 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Wed, 9 Oct 2024 08:38:25 -0700 Subject: [PATCH] Use custom resolver for query and eval with nested frames. Verify preflighting of nested expressions using AST visitation. Remove logic for splitting queries by string. Now the evaluation is handled by a nested column resolver, and the mixed-mode expressions are preflighted by examining the parsed abstract syntax tree for the query expression. --- .gitignore | 3 + docs/tutorials/data_loading_notebook.ipynb | 16 +- docs/tutorials/data_manipulation.ipynb | 4 +- docs/tutorials/low_level.ipynb | 18 +- docs/tutorials/nested_spectra.ipynb | 10 +- src/nested_pandas/nestedframe/core.py | 237 ++++++++++++++---- src/nested_pandas/nestedframe/utils.py | 39 +++ .../nestedframe/test_nestedframe.py | 42 ++++ .../nestedframe/test_nestedframe_utils.py | 17 -- tests/nested_pandas/utils/test_utils.py | 27 ++ 10 files changed, 320 insertions(+), 93 deletions(-) delete mode 100644 tests/nested_pandas/nestedframe/test_nestedframe_utils.py diff --git a/.gitignore b/.gitignore index 50990fe..dfe2e99 100644 --- a/.gitignore +++ b/.gitignore @@ -133,6 +133,9 @@ dmypy.json # vscode .vscode/ +# PyCharm +.idea/ + # dask dask-worker-space/ diff --git a/docs/tutorials/data_loading_notebook.ipynb b/docs/tutorials/data_loading_notebook.ipynb index 6b54f86..8aa5e62 100644 --- a/docs/tutorials/data_loading_notebook.ipynb +++ b/docs/tutorials/data_loading_notebook.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With a valid Python environment, nested-pandas and it's dependencies are easy to install using the `pip` package manager. The following command can be used to install it:" + "With a valid Python environment, nested-pandas and its dependencies are easy to install using the `pip` package manager. 
The following command can be used to install it:" ] }, { @@ -47,7 +47,7 @@ "\n", "We can use the `NestedFrame` constructor to create our base frame from a dictionary of our columns.\n", "\n", - "We can then create an addtional pandas dataframes and pack them into our `NestedFrame` with `NestedFrame.add_nested`" + "We can then create additional pandas dataframes and pack them into our `NestedFrame` with `NestedFrame.add_nested`." ] }, { @@ -97,7 +97,7 @@ "# Note: that we use the `tempfile` module to create and then cleanup a temporary directory.\n", "# You can of course remove this and use your own directory and real files on your system.\n", "with tempfile.TemporaryDirectory() as temp_path:\n", - " # Generates parquet files with random data within our temporary directorye.\n", + " # Generates parquet files with random data within our temporary directory.\n", " generate_parquet_file(10, {\"nested1\": 100, \"nested2\": 10}, temp_path, file_per_layer=True)\n", "\n", " # Read each individual parquet file into its own dataframe.\n", @@ -148,7 +148,7 @@ "source": [ "So inspect `nf`, a `NestedFrame` we created from our call to `read_parquet` with the `to_pack` argument, we're able to pack nested parquet files according to the shared index values with the index in `base.parquet`.\n", "\n", - "The resulting `NestedFrame` having the same number of rows as `base.parquet` and with `nested1.parquet` and `nested2.parquet` packed into the 'nested1' and 'nested2' columns respectively." + "The resulting `NestedFrame` has the same number of rows as `base.parquet`, with `nested1.parquet` and `nested2.parquet` packed into the `nested1` and `nested2` columns respectively."
] }, { @@ -164,7 +164,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since we loaded each individual parquet file into its own dataframe, we can also verify that using `read_parquet` with the `to_pack` argument is equivalent to the following method of packing the dataframes directly with `NestedFrame.add_nested`" + "Since we loaded each individual parquet file into its own dataframe, we can also verify that using `read_parquet` with the `to_pack` argument is equivalent to the following method of packing the dataframes directly with `NestedFrame.add_nested`." ] }, { @@ -189,11 +189,11 @@ "source": [ "# Saving NestedFrames to Parquet Files\n", "\n", - "Additionally we can save an existing `NestedFrame` as one of more parquet files using `NestedFrame.to_parquet``\n", + "Additionally we can save an existing `NestedFrame` as one or more parquet files using `NestedFrame.to_parquet`.\n", "\n", "When `by_layer=True` we save each individual layer of the NestedFrame into its own parquet file in a specified output directory.\n", "\n", - "The base layer will be outputted to \"base.parquet\", and each nested layer will be written to a file based on its column name. So the nested layer in column `nested1` will be written to \"nested1.parquet\"." + "The base layer will be outputted to `base.parquet`, and each nested layer will be written to a file based on its column name. So the nested layer in column `nested1` will be written to `nested1.parquet`."
] }, { @@ -233,7 +233,7 @@ "source": [ "We also support saving a `NestedFrame` as a single parquet file where the packed layers are still packed in their respective columns.\n", "\n", - "Here we provide `NestedFrame.to_parquet` with the desired path of the *single* output file (rather than the path of a directory to store *multiple* output files) and use `per_layer=False'\n", + "Here we provide `NestedFrame.to_parquet` with the desired path of the *single* output file (rather than the path of a directory to store *multiple* output files) and use `per_layer=False`.\n", "\n", "Our `read_parquet` function can load a `NestedFrame` saved in this single file parquet without requiring any additional arguments. " ] diff --git a/docs/tutorials/data_manipulation.ipynb b/docs/tutorials/data_manipulation.ipynb index 941de7a..a0aa822 100644 --- a/docs/tutorials/data_manipulation.ipynb +++ b/docs/tutorials/data_manipulation.ipynb @@ -49,7 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, we can directly fetch a column from our nested column (aptly called \"nested\"). For example, below we can fetch the time column, \"t\", by specifying `\"nested.t\"` as the column to retrieve. This returns a \"flat\" view of the nested t column, where all rows from all dataframes are present in one dataframe." + "First, we can directly fetch a column from our nested column (aptly called \"nested\"). For example, below we can fetch the time column, \"t\", by specifying `\"nested.t\"` as the column to retrieve. This returns a \"flat\" view of the nested `t` column, where all rows from all dataframes are present in one dataframe." 
] }, { @@ -170,7 +170,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is functionally equivalent to using `add_nested`" + "This is functionally equivalent to using `add_nested`:" ] }, { diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb index 307366c..02f815b 100644 --- a/docs/tutorials/low_level.ipynb +++ b/docs/tutorials/low_level.ipynb @@ -8,7 +8,7 @@ "# Lower-level interface for performance and flexibility\n", "## Reveal the hidden power of nested Series\n", "\n", - "This section is for users looking to optimize the performance, both computationally and in memory-usage, of their workflows. This section also details a broader suite of data representations usable within `nested-pandas`.\n", + "This section is for users looking to optimize both the compute and memory performance of their workflows. This section also details a broader suite of data representations usable within `nested-pandas`.\n", "It shows how to deal with individual nested columns: add, remove, and modify data using both \"flat-array\" and \"list-array\" representations.\n", "It also demonstrates how to convert nested Series to and from different data types, like `pd.ArrowDtype`d Series, flat dataframes, list-array dataframes, and collections of nested elements." ] @@ -36,7 +36,7 @@ "source": [ "## Generate some data and get a Series of `NestedDtype` type\n", "\n", - "We are going to use built-in data generator to get a `NestedFrame` with a \"nested\" column being a `Series` of `NestedDtype` type.\n", + "We are going to use the built-in data generator to get a `NestedFrame` with a \"nested\" column being a `Series` of `NestedDtype` type.\n", "This column would represent [light curves](https://en.wikipedia.org/wiki/Light_curve) of some astronomical objects. 
" ] }, @@ -94,7 +94,7 @@ "id": "33d8caacf0bf042e", "metadata": {}, "source": [ - "You can also get a list of fields with `.fields` attribute" + "You can also get a list of fields with `.fields` attribute:" ] }, { @@ -130,7 +130,7 @@ "id": "7167f5a9c947d96f", "metadata": {}, "source": [ - "You can also get a subset of nested columns as a new nested Series" + "You can also get a subset of nested columns as a new nested Series:" ] }, { @@ -479,7 +479,7 @@ "source": [ "#### pd.Series from an array\n", "\n", - "Construction with `pyarrow` struct arrays is the cheapest way to create a nested Series. It is very semilliar to initialisation of a `pd.Series` of `pd.ArrowDtype` type." + "Construction with `pyarrow` struct arrays is the cheapest way to create a nested Series. It is very similar to the initialization of a `pd.Series` of `pd.ArrowDtype` type." ] }, { @@ -611,21 +611,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.6" } }, "nbformat": 4, diff --git a/docs/tutorials/nested_spectra.ipynb b/docs/tutorials/nested_spectra.ipynb index d311655..8d2fffd 100644 --- a/docs/tutorials/nested_spectra.ipynb +++ b/docs/tutorials/nested_spectra.ipynb @@ -79,7 +79,7 @@ "flux = np.array([])\n", "err = np.array([])\n", "index = np.array([])\n", - "# Loop over each spectrum, adding it's data to the arrays\n", + "# Loop over each spectrum, adding its data to the arrays\n", "for i, hdu in enumerate(sp):\n", " wave = np.append(wave, 10 ** hdu[\"COADD\"].data.loglam) # * u.angstrom\n", " flux = np.append(flux, hdu[\"COADD\"].data.flux * 1e-17) # * 
u.erg/u.second/u.centimeter**2/u.angstrom\n", @@ -115,7 +115,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And we can see that each object now has the \"coadd_spectrum\" nested column with the full spectrum available." + "And we can see that each object now has the `coadd_spectrum` nested column with the full spectrum available." ] }, { @@ -161,7 +161,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -175,9 +175,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index c244c0a..ff05ff4 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -13,7 +13,26 @@ from nested_pandas.series import packer from nested_pandas.series.dtype import NestedDtype -from .utils import _ensure_spacing +from ..series.packer import pack_sorted_df_into_struct +from .utils import NestingType, check_expr_nesting + + +class NestedSeries(pd.Series): + """ + Series that were unpacked from a nest. 
+ """ + + _metadata = ["nest_name", "flat_nest"] + + @property + def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 + return NestedSeries + + @property + def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 + return NestedSeries + + __pandas_priority__ = 3500 class NestedFrame(pd.DataFrame): @@ -22,8 +41,7 @@ class NestedFrame(pd.DataFrame): See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures """ - # normal properties - _metadata = ["added_property"] + __pandas_priority__ = 4500 @property def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 @@ -71,7 +89,7 @@ def __getitem__(self, item): """Adds custom __getitem__ behavior for nested columns""" if isinstance(item, str): - # Pre-empt the nested check if the item is a base column + # Preempt the nested check if the item is a base column if item in self.columns: return super().__getitem__(item) # If a nested column name is passed, return a flat series for that column @@ -289,38 +307,110 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): else: return NestedFrame(packed_df.to_frame()) - def _split_query(self, expr) -> dict: - """Splits a pandas query into multiple subqueries for nested and base layers""" - # Ensure query has needed spacing for upcoming split - expr = _ensure_spacing(expr) - nest_exprs = {col: [] for col in self.nested_columns + ["base"]} # type: dict - split_expr = expr.split(" ") - - i = 0 - current_focus = "base" - while i < len(split_expr): - expr_slice = split_expr[i].strip("()") - # Check if it's a nested column - if self._is_known_hierarchical_column(expr_slice): - nested, colname = split_expr[i].split(".") - current_focus = nested.strip("()") - # account for parentheses - j = 0 - while j < len(nested): - if nested[j] == "(": - nest_exprs[current_focus].append("(") - j += 1 - nest_exprs[current_focus].append(colname) - # or if it's a top-level column 
- elif expr_slice in self.columns: - current_focus = "base" - nest_exprs[current_focus].append(split_expr[i]) - else: - nest_exprs[current_focus].append(split_expr[i]) - i += 1 - return {expr: " ".join(nest_exprs[expr]) for expr in nest_exprs if len(nest_exprs[expr]) > 0} + def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: + """ + + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Works the same way as `pd.DataFrame.eval`, except that this method + will also automatically unpack nested columns into NestedSeries, + and the resulting expression will have the dimensions of the unpacked + series. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, pandas object, or None + The result of the evaluation or None if ``inplace=True``. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.assign : Can evaluate an expression or function to create new + values for a column. + eval : Evaluate a Python expression as a string using various + backends. - def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + <enhancingperf.eval>`.
+ + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + nested_resolvers = self._get_nested_column_resolvers() + kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (nested_resolvers,) + kwargs["inplace"] = inplace + return super().eval(expr, **kwargs) + + def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None: """ Query the columns of a NestedFrame with a boolean expression. Specified queries can target nested columns in addition to the typical column set @@ -348,6 +438,12 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. + inplace : bool + Whether to modify the DataFrame rather than creating a new one. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. 
+ Returns ------- DataFrame @@ -363,25 +459,62 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 >>> df.query("mynested.a > 2") """ - - # Rebuild queries for each specified nested/base layer - exprs_to_use = self._split_query(expr) - - # For now (simplicity), limit query to only operating on one layer - if len(exprs_to_use.keys()) != 1: + if not isinstance(expr, str): + msg = f"expr must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None + # At present, the query expression must be either entirely within the + # nested namespace or the base namespace. Mixed structures are not + # supported, so preflight the expression. + nesting_types = check_expr_nesting(expr) + if NestingType.NESTED in nesting_types and NestingType.BASE in nesting_types: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") - - # Send queries to layers - # We'll only execute 1 per the Error above, but the loop will be useful - # for when/if we allow multi-layer queries - result = self.copy() - for expr in exprs_to_use: - if expr == "base": - result = super().query(exprs_to_use["base"], inplace=False) + result = self.eval(expr, **kwargs) + # If the result is a NestedSeries, then the evaluation has caused unpacking, + # which means that a nested attribute was referenced. Apply this result + # to the nest and repack. Otherwise, apply it to this instance as usual, + # since it operated on the base attributes. 
+ try: + if isinstance(result, NestedSeries): + nest_name, flat_nest = result.nest_name, result.flat_nest + new_flat_nest = flat_nest.loc[result] + result = self.copy() + result[nest_name] = pack_sorted_df_into_struct(new_flat_nest) else: - # TODO: does not work with queries that empty the dataframe - result[expr] = result[expr].nest.query_flat(exprs_to_use[expr]) - return result + result = self.loc[result] + except ValueError: + # when result is multi-dimensional loc raises, but this is sometimes a + # valid query + result = self[result] + + if inplace: + self._update_inplace(result) + return None + else: + return result + + def _get_nested_column_resolvers(self): + class NestResolver: + def __init__(self, nest_name: str, outer): + self._nest_name = nest_name + # Save the outer frame with an eye toward repacking. + self._outer = outer + # Flattened only once for every access of this particular nest + # within the expression. + self._flat_nest = outer[nest_name].nest.to_flat() + + def __getattr__(self, item_name: str): + if item_name in self._flat_nest: + result = NestedSeries(self._flat_nest[item_name]) + # Assigning these properties directly in order to avoid any complication + # or interference with the inherited pd.Series constructor.
+ result.nest_name = self._nest_name + result.flat_nest = self._flat_nest + return result + raise AttributeError(f"No attribute {item_name}") + + return {name: NestResolver(name, self) for name in self.nested_columns} def _resolve_dropna_target(self, on_nested, subset): """resolves the target layer for a given set of dropna kwargs""" diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index 765ae73..1ca1e0a 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -1,3 +1,7 @@ +import ast +from enum import Enum + + def _ensure_spacing(expr) -> str: """Ensure that an eval string has spacing""" single_val_operators = {"+", "-", "*", "/", "%", ">", "<", "|", "&", "~", "="} # omit "(" and ")" @@ -33,3 +37,38 @@ def _ensure_spacing(expr) -> str: spaced_expr += " " i += 1 return spaced_expr + + +class NestingType(Enum): + """Types of sub-expressions possible in a NestedFrame string expression.""" + + BASE = "base" + NESTED = "nested" + + +def _expr_nesting_type(node: ast.expr | None) -> set[NestingType]: + if not isinstance(node, ast.expr): + return set() + if isinstance(node, ast.Name): + return {NestingType.BASE} + if isinstance(node, ast.Attribute): + return {NestingType.NESTED} + sources = ( + [getattr(node, "left", None), getattr(node, "right", None)] + + getattr(node, "values", []) + + getattr(node, "comparators", []) + ) + result: set[NestingType] = set() + for s in sources: + result.update(_expr_nesting_type(s)) + return result + + +def check_expr_nesting(expr: str) -> set[NestingType]: + """ + Given a string expression, parse it and visit the resulting AST, surfacing + the nesting types. The purpose is to identify expressions that attempt + to mix base and nested columns, which will need to be handled specially. 
+ """ + expr_tree = ast.parse(expr, mode="eval").body + return set(_expr_nesting_type(expr_tree)) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index b29881d..1f506a8 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -4,6 +4,7 @@ import pytest from nested_pandas import NestedFrame from nested_pandas.datasets import generate_data +from nested_pandas.nestedframe.core import NestedSeries from pandas.testing import assert_frame_equal @@ -689,3 +690,44 @@ def cols_allclose(col1, col2): assert_frame_equal( result, pd.DataFrame({"allclose": [True, True, True]}, index=pd.Index([0, 1, 2], name="idx")) ) + + +def test_scientific_notation(): + """ + Test that NestedFrame.query handles constants that are written in scientific notation. + """ + # https://github.com/lincc-frameworks/nested-pandas/issues/59 + base = NestedFrame({"a": [1, 1e-2, 3]}, index=[0, 1, 2]) + selected = base.query("a > 1e-1") + assert list(selected.index) == [0, 2] + + +def test_eval(): + """ + Test basic behavior of NestedFrame.eval, and that it can handle nested references + the same as the nest accessor. 
+ """ + nf = NestedFrame( + data={"a": [1, 2, 3], "b": [2, 4, 6]}, + index=pd.Index([0, 1, 2], name="idx"), + ) + + to_pack = pd.DataFrame( + data={ + "time": [1, 2, 3, 1, 2, 4, 2, 1, 4], + "c": [0, 2, 4, 10, 4, 3, 1, 4, 1], + "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], + }, + index=pd.Index([0, 0, 0, 1, 1, 1, 2, 2, 2], name="idx"), + ) + + nf = nf.add_nested(to_pack, "packed") + p5 = nf.eval("packed.d > 5") + assert isinstance(p5, NestedSeries) + assert p5.any() + assert not p5.all() + assert list(p5.loc[p5].index) == [0, 2] + + r1 = nf.eval("packed.c + packed.d") + r2 = nf["packed"].nest["c"] + nf["packed"].nest["d"] + assert (r1 == r2).all() diff --git a/tests/nested_pandas/nestedframe/test_nestedframe_utils.py b/tests/nested_pandas/nestedframe/test_nestedframe_utils.py deleted file mode 100644 index 3908cb8..0000000 --- a/tests/nested_pandas/nestedframe/test_nestedframe_utils.py +++ /dev/null @@ -1,17 +0,0 @@ -import pytest -from nested_pandas.nestedframe import utils - - -@pytest.mark.parametrize( - "in_out", - [ - ("a>3", "a > 3"), - ("test.a>5&b==2", "test.a > 5 & b == 2"), - ("b > 3", "b > 3"), - ("(a.b > 3)&(a.c == 'f')", "(a.b > 3) & (a.c == 'f')"), - ], -) -def test_ensure_spacing(in_out): - """test a set of input queries to make sure spacing is done correctly""" - expr, output = in_out - assert utils._ensure_spacing(expr) == output diff --git a/tests/nested_pandas/utils/test_utils.py b/tests/nested_pandas/utils/test_utils.py index 5397d75..d76d592 100644 --- a/tests/nested_pandas/utils/test_utils.py +++ b/tests/nested_pandas/utils/test_utils.py @@ -2,6 +2,7 @@ import pandas as pd import pytest from nested_pandas import NestedFrame +from nested_pandas.nestedframe.utils import NestingType, check_expr_nesting from nested_pandas.utils import count_nested @@ -43,3 +44,29 @@ def test_count_nested(join): else: assert total_counts.columns.tolist() == ["n_nested"] assert label_counts.columns.tolist() == ["n_nested_a", "n_nested_b"] + + +def test_check_expr_nesting(): + 
""" + Test the correctness of the evaluation expression pre-flight checks, which are + used to ensure that an expression-based query does not try to combine base and nested + sub-expressions. + """ + assert check_expr_nesting("a > 2 & nested.c > 1") == {NestingType.NESTED, NestingType.BASE} + assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {NestingType.NESTED} + assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {NestingType.BASE} + assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == { + NestingType.NESTED, + NestingType.BASE, + } + + # NOTE: this correctly captures the desired behavior here, but suggests that the two nests + # are interoperable, which is too strong a claim. + assert check_expr_nesting("a.b > 2 & c.d < 5") == {NestingType.NESTED} + + assert check_expr_nesting("a>3") == {NestingType.BASE} + assert check_expr_nesting("a > 3") == {NestingType.BASE} + assert check_expr_nesting("test.a>5&b==2") == {NestingType.NESTED, NestingType.BASE} + assert check_expr_nesting("test.a > 5 & b == 2") == {NestingType.NESTED, NestingType.BASE} + assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {NestingType.NESTED} + assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {NestingType.NESTED}