lincc-frameworks · dougbrn · Apr 9, 2024 · Apr 5, 2024 · Apr 5, 2024 · Apr 5, 2024
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -6,6 +6,8 @@
 from nested_pandas.series import packer
 from nested_pandas.series.dtype import NestedDtype
 
+from .utils import _ensure_spacing
+
 
 class NestedFrame(pd.DataFrame):
     """A Pandas Dataframe extension with support for nested structure.
@@ -58,3 +60,97 @@ def add_nested(self, nested, name) -> Self:  # type: ignore[name-defined] # noqa
         packed = packer.pack_flat(nested, name=name)
         label = packed.name
         return self.assign(**{f"{label}": packed})
+
+    def _split_query(self, expr) -> dict:
+        """Break a query string into one sub-query per targeted layer.
+
+        The expression is normalised with `_ensure_spacing` and cut on single
+        spaces. Each token (ignoring surrounding parentheses) is then routed:
+
+        * tokens accepted by `self._is_known_hierarchical_column` are split on
+          "." into a nested-column name and a sub-column name; the nested
+          column becomes the active layer and only the sub-column name is kept
+          (opening parentheses in front of the nested name are re-emitted as
+          separate "(" tokens),
+        * tokens naming an entry of `self.columns` switch the active layer
+          back to "base",
+        * every other token (operators, literals, ...) is attached to
+          whichever layer is currently active.
+
+        Parameters
+        ----------
+        expr : str
+            The query string to split.
+
+        Returns
+        -------
+        dict
+            Maps a layer name (a nested column name or "base") to the
+            space-joined sub-query for that layer. Layers that received no
+            tokens are omitted.
+        """
+        # Spacing must be normalised first so that a plain split isolates operators
+        spaced_query = _ensure_spacing(expr)
+        layer_tokens = {layer: [] for layer in self.nested_columns + ["base"]}  # type: dict
+
+        active_layer = "base"
+        for token in spaced_query.split(" "):
+            bare_token = token.strip("()")
+
+            if self._is_known_hierarchical_column(bare_token):
+                prefix, sub_column = token.split(".")
+                active_layer = prefix.strip("()")
+                target = layer_tokens[active_layer]
+                # keep any opening parentheses that preceded the nested name
+                target.extend("(" * prefix.count("("))
+                target.append(sub_column)
+                continue
+
+            if bare_token in self.columns:
+                # a top-level column moves the focus back to the base layer
+                active_layer = "base"
+            layer_tokens[active_layer].append(token)
+
+        return {layer: " ".join(tokens) for layer, tokens in layer_tokens.items() if tokens}
+
+    def query(self, expr) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """
+        Query the columns of a NestedFrame with a boolean expression. Specified
+        queries can target nested columns in addition to the typical column set
+
+        Parameters
+        ----------
+        expr : str
+            The query string to evaluate.
+
+            Access nested columns using `nested_df.nested_col` (where
+            `nested_df` refers to a particular nested dataframe and
+            `nested_col` is a column of that nested dataframe).
+
+            You can refer to variables
+            in the environment by prefixing them with an '@' character like
+            ``@a + b``.
+
+            You can refer to column names that are not valid Python variable names
+            by surrounding them in backticks. Thus, column names containing spaces
+            or punctuations (besides underscores) or starting with digits must be
+            surrounded by backticks. (For example, a column named "Area (cm^2)" would
+            be referenced as ```Area (cm^2)```). Column names which are Python keywords
+            (like "list", "for", "import", etc) cannot be used.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
+
+        Returns
+        -------
+        NestedFrame
+            A new frame resulting from the provided query expression; the
+            frame the method is called on is not modified.
+
+        Raises
+        ------
+        ValueError
+            If the expression references more than one layer (for example a
+            top-level column together with a nested column, or two different
+            nested columns).
+
+        Notes
+        -----
+        Queries that target a particular nested structure return a dataframe
+        with rows of that particular nested structure filtered. For example,
+        querying the NestedFrame "df" with nested structure "my_nested" as
+        below will return all rows of df, but with my_nested filtered by the
+        condition:
+
+        >>> df.query("my_nested.a > 2")
+        """
+
+        # Rebuild queries for each specified nested/base layer
+        exprs_to_use = self._split_query(expr)
+
+        # For now (simplicity), limit query to only operating on one layer
+        if len(exprs_to_use) != 1:
+            raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
+
+        # Send queries to layers
+        # We'll only execute 1 per the Error above, but the loop will be useful
+        # for when/if we allow multi-layer queries
+        result = None
+        for layer, layer_expr in exprs_to_use.items():
+            if layer == "base":
+                # level=1 makes pandas resolve "@variable" references in the
+                # frame of whoever called this method rather than in this
+                # wrapper's own frame.
+                result = super().query(layer_expr, inplace=False, level=1)
+            else:
+                if result is None:
+                    # Copy only when a nested column is about to be replaced;
+                    # a base-layer query already returns a new frame.
+                    result = self.copy()
+                # TODO: does not work with queries that empty the dataframe
+                # NOTE(review): whether "@variable" references resolve here
+                # depends on `nest.query_flat` -- confirm.
+                result[layer] = result[layer].nest.query_flat(layer_expr)
+        return result
diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py
@@ -0,0 +1,35 @@
+def _ensure_spacing(expr: str) -> str:
+    """Ensure that every operator in an eval/query string is space-delimited.
+
+    A single space is inserted before and/or after an operator wherever one
+    is missing, so that the expression can later be tokenised with a plain
+    ``str.split(" ")``. Existing whitespace is left untouched, and no space
+    is added at the very start or very end of the expression.
+
+    The following spans are copied through verbatim, so operators inside
+    them are never padded:
+
+    * single- or double-quoted string literals,
+    * backtick-quoted column names,
+    * floating point literals with an exponent (e.g. ``1e-5``).
+
+    Parentheses are deliberately not treated as operators.
+
+    Parameters
+    ----------
+    expr : str
+        The expression to normalise.
+
+    Returns
+    -------
+    str
+        The expression with space-delimited operators.
+    """
+    import re
+
+    token_re = re.compile(
+        r"(?P<literal>"
+        r"'(?:[^'\\]|\\.)*'"  # single-quoted string
+        r'|"(?:[^"\\]|\\.)*"'  # double-quoted string
+        r"|`[^`]*`"  # backtick-quoted column name
+        r"|(?<![\w.])(?:\d+\.?\d*|\.\d+)[eE][+-]?\d+"  # float with exponent
+        r")"
+        # two-character operators must be tried before their one-character prefixes
+        r"|(?P<operator>==|!=|>=|<=|//|\*\*|<<|>>|[-+*/%<>|&~=])"
+    )
+
+    pieces = []  # never holds empty strings, so pieces[-1] is the true tail
+    cursor = 0
+    for match in token_re.finditer(expr):
+        start, end = match.span()
+        if start > cursor:
+            pieces.append(expr[cursor:start])
+        cursor = end
+
+        if match.lastgroup != "operator":
+            # protected literal: copy as-is
+            pieces.append(match.group())
+            continue
+
+        # Look at what has been emitted so far (not the raw input) so that two
+        # adjacent operators share a single separating space.
+        if pieces and not pieces[-1].endswith(" "):
+            pieces.append(" ")
+        pieces.append(match.group())
+        if end < len(expr) and expr[end] != " ":
+            pieces.append(" ")
+
+    if cursor < len(expr):
+        pieces.append(expr[cursor:])
+    return "".join(pieces)
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import pytest
 from nested_pandas import NestedFrame
 
 
@@ -74,3 +75,29 @@ def test_add_nested():
 
     assert "nested" in base.columns
     assert base.nested.nest.to_flat().equals(nested)
+
+
+def test_query():
+    """Exercise NestedFrame.query with base-only, mixed-layer and nested-only expressions"""
+
+    flat = pd.DataFrame(
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+    )
+    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+    frame = frame.add_nested(flat, "nested")
+
+    # A query on a top-level column filters the rows of the frame itself
+    top_level_hits = frame.query("a > 2")
+    assert len(top_level_hits) == 1
+
+    # Combining a top-level column and a nested column in one query is rejected
+    with pytest.raises(ValueError):
+        frame.query("a > 2 & nested.c > 1")
+
+    # A query on nested columns filters the rows of the nested structure
+    nested_cases = [
+        ("nested.c > 1", 5),
+        ("(nested.c > 1) and (nested.d>2)", 4),
+    ]
+    for nested_expr, expected_rows in nested_cases:
+        filtered = frame.query(nested_expr)
+        assert len(filtered.nested.nest.to_flat()) == expected_rows
diff --git a/tests/nested_pandas/nestedframe/test_utils.py b/tests/nested_pandas/nestedframe/test_utils.py
@@ -0,0 +1,17 @@
+import pytest
+from nested_pandas.nestedframe import utils
+
+
+@pytest.mark.parametrize(
+    "in_out",
+    [
+        # (raw expression, expected spaced expression)
+        ("a>3", "a > 3"),
+        ("test.a>5&b==2", "test.a > 5 & b == 2"),
+        ("b > 3", "b > 3"),
+        ("(a.b > 3)&(a.c == 'f')", "(a.b > 3) & (a.c == 'f')"),
+    ],
+)
+def test_ensure_spacing(in_out):
+    """Check that `_ensure_spacing` turns each raw query into its expected spaced form"""
+    raw_query, expected = in_out
+    spaced = utils._ensure_spacing(raw_query)
+    assert spaced == expected