diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
index dfe4db6..4970930 100644
--- a/src/nested_pandas/nestedframe/core.py
+++ b/src/nested_pandas/nestedframe/core.py
@@ -6,6 +6,8 @@
 from nested_pandas.series import packer
 from nested_pandas.series.dtype import NestedDtype
 
+from .utils import _ensure_spacing
+
 
 class NestedFrame(pd.DataFrame):
     """A Pandas Dataframe extension with support for nested structure.
@@ -58,3 +60,109 @@ def add_nested(self, nested, name) -> Self:  # type: ignore[name-defined] # noqa
         packed = packer.pack_flat(nested, name=name)
         label = packed.name
         return self.assign(**{f"{label}": packed})
+
+    def _split_query(self, expr) -> dict:
+        """Split a pandas query into per-layer subqueries.
+
+        Parameters
+        ----------
+        expr : str
+            The query string to split.
+
+        Returns
+        -------
+        dict
+            Maps each targeted layer (a nested column name, or "base" for
+            top-level columns) to the subquery built from the tokens that
+            belong to it. Layers that received no tokens are omitted.
+        """
+        # Ensure query has needed spacing for upcoming split
+        expr = _ensure_spacing(expr)
+        nest_exprs = {col: [] for col in self.nested_columns + ["base"]}  # type: dict
+
+        # Tokens that are not column references (operators, literals) are
+        # attached to whichever layer was referenced most recently
+        current_focus = "base"
+        for token in expr.split(" "):
+            expr_slice = token.strip("()")
+            # Check if it's a nested column
+            if self._is_known_hierarchical_column(expr_slice):
+                nested, colname = token.split(".")
+                current_focus = nested.strip("()")
+                # account for parentheses opened before the layer name
+                nest_exprs[current_focus].extend(["("] * nested.count("("))
+                nest_exprs[current_focus].append(colname)
+                continue
+            # or if it's a top-level column
+            if expr_slice in self.columns:
+                current_focus = "base"
+            nest_exprs[current_focus].append(token)
+        return {layer: " ".join(tokens) for layer, tokens in nest_exprs.items() if tokens}
+
+    def query(self, expr) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """
+        Query the columns of a NestedFrame with a boolean expression. Specified
+        queries can target nested columns in addition to the typical column set
+
+        Parameters
+        ----------
+        expr : str
+            The query string to evaluate.
+
+            Access nested columns using `nested_df.nested_col` (where
+            `nested_df` refers to a particular nested dataframe and
+            `nested_col` is a column of that nested dataframe).
+
+            You can refer to variables
+            in the environment by prefixing them with an '@' character like
+            ``@a + b``.
+
+            You can refer to column names that are not valid Python variable names
+            by surrounding them in backticks. Thus, column names containing spaces
+            or punctuations (besides underscores) or starting with digits must be
+            surrounded by backticks. (For example, a column named "Area (cm^2)" would
+            be referenced as ```Area (cm^2)```). Column names which are Python keywords
+            (like "list", "for", "import", etc) cannot be used.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame resulting from the provided query expression.
+
+        Raises
+        ------
+        ValueError
+            If the query does not resolve to exactly one layer.
+
+        Notes
+        -----
+        Queries that target a particular nested structure return a dataframe
+        with rows of that particular nested structure filtered. For example,
+        querying the NestedFrame "df" with nested structure "my_nested" as
+        below will return all rows of df, but with my_nested filtered by the
+        condition:
+
+        >>> df.query("my_nested.a > 2")
+        """
+
+        # Rebuild queries for each specified nested/base layer
+        exprs_to_use = self._split_query(expr)
+
+        # For now (simplicity), limit query to only operating on one layer
+        if len(exprs_to_use) != 1:
+            raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
+
+        # Send queries to layers
+        # We'll only execute 1 per the Error above, but the loop will be useful
+        # for when/if we allow multi-layer queries
+        result = self.copy()
+        for layer, layer_expr in exprs_to_use.items():
+            if layer == "base":
+                result = super().query(layer_expr, inplace=False)
+            else:
+                # TODO: does not work with queries that empty the dataframe
+                result[layer] = result[layer].nest.query_flat(layer_expr)
+        return result
diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py
new file mode 100644
index 0000000..765ae73
--- /dev/null
+++ b/src/nested_pandas/nestedframe/utils.py
@@ -0,0 +1,50 @@
+import re
+
+# Matches either a quoted span (string literal or backtick-quoted name), which
+# must be left untouched, or an operator to pad. Two-character operators come
+# first in the alternation so they win over their one-character prefixes.
+_SPACING_TOKEN = re.compile(
+    r"""(?P<quoted>'[^']*'|"[^"]*"|`[^`]*`)"""
+    r"""|(?P<op>==|!=|>=|<=|//|\*\*|[-+*/%><|&~=])"""
+)
+
+
+def _ensure_spacing(expr) -> str:
+    """Ensure that an eval string has spacing around its operators.
+
+    Parameters
+    ----------
+    expr : str
+        The query/eval string to normalize.
+
+    Returns
+    -------
+    str
+        ``expr`` with a space added on each side of an operator that is not
+        already space-separated. Quoted spans are copied through unchanged,
+        parentheses are never padded, and no padding is added at the very
+        start or end of the string.
+    """
+    pieces: list[str] = []
+    last_char = ""  # last character emitted so far ("" while nothing emitted)
+    pos = 0
+    for match in _SPACING_TOKEN.finditer(expr):
+        skipped = expr[pos : match.start()]
+        token = match.group()
+        pos = match.end()
+        if skipped:
+            pieces.append(skipped)
+            last_char = skipped[-1]
+        if match.lastgroup == "op":
+            if last_char not in ("", " "):
+                pieces.append(" ")
+            pieces.append(token)
+            last_char = token[-1]
+            if pos < len(expr) and expr[pos] != " ":
+                pieces.append(" ")
+                last_char = " "
+        else:
+            pieces.append(token)
+            last_char = token[-1]
+    pieces.append(expr[pos:])
+    return "".join(pieces)
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
index 8c70da3..15f9d9f 100644
--- a/tests/nested_pandas/nestedframe/test_nestedframe.py
+++ b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import pytest
 from nested_pandas import NestedFrame
 
 
@@ -74,3 +75,29 @@ def test_add_nested():
 
     assert "nested" in base.columns
     assert base.nested.nest.to_flat().equals(nested)
+
+
+def test_query():
+    """Test that NestedFrame.query handles nested queries correctly"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    # Test vanilla queries
+    base = base.add_nested(nested, "nested")
+    assert len(base.query("a > 2")) == 1
+
+    # Check for the multi-layer error
+    with pytest.raises(ValueError):
+        base.query("a > 2 & nested.c > 1")
+
+    # Test nested queries
+    nest_queried = base.query("nested.c > 1")
+    assert len(nest_queried.nested.nest.to_flat()) == 5
+
+    nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
+    assert len(nest_queried.nested.nest.to_flat()) == 4
diff --git a/tests/nested_pandas/nestedframe/test_utils.py b/tests/nested_pandas/nestedframe/test_utils.py
new file mode 100644
index 0000000..3908cb8
--- /dev/null
+++ b/tests/nested_pandas/nestedframe/test_utils.py
@@ -0,0 +1,23 @@
+import pytest
+from nested_pandas.nestedframe import utils
+
+
+@pytest.mark.parametrize(
+    "in_out",
+    [
+        ("a>3", "a > 3"),
+        ("test.a>5&b==2", "test.a > 5 & b == 2"),
+        ("b > 3", "b > 3"),
+        ("(a.b > 3)&(a.c == 'f')", "(a.b > 3) & (a.c == 'f')"),
+        ("a!=3", "a != 3"),
+        ("a=='x-y'", "a == 'x-y'"),
+        ("`a-b`>1", "`a-b` > 1"),
+        ("~a", "~ a"),
+        ("a >", "a >"),
+        ("", ""),
+    ],
+)
+def test_ensure_spacing(in_out):
+    """test a set of input queries to make sure spacing is done correctly"""
+    expr, output = in_out
+    assert utils._ensure_spacing(expr) == output