Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial query wrapping #19

Merged
merged 5 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from nested_pandas.series import packer
from nested_pandas.series.dtype import NestedDtype

from .utils import _ensure_spacing


class NestedFrame(pd.DataFrame):
"""A Pandas Dataframe extension with support for nested structure.
Expand Down Expand Up @@ -58,3 +60,97 @@ def add_nested(self, nested, name) -> Self: # type: ignore[name-defined] # noqa
packed = packer.pack_flat(nested, name=name)
label = packed.name
return self.assign(**{f"{label}": packed})

def _split_query(self, expr) -> dict:
    """Partition a query string into one sub-expression per layer.

    The expression is first normalised so that operators are surrounded by
    spaces, then split on single spaces. Tokens are routed to whichever
    layer was referenced most recently: a ``nested.column`` token switches
    to that nested layer (and is stored without its ``nested.`` prefix), a
    top-level column name switches back to ``"base"``, and every other
    token (operators, literals, keywords) follows the current layer.

    Parameters
    ----------
    expr : str
        The full query string.

    Returns
    -------
    dict
        Maps each layer name that received at least one token to its
        space-joined sub-expression. Layers with no tokens are omitted.
    """
    # Operators must be whitespace-delimited for the token split below
    tokens = _ensure_spacing(expr).split(" ")
    layer_tokens = {name: [] for name in self.nested_columns + ["base"]}  # type: dict

    active_layer = "base"
    for token in tokens:
        bare = token.strip("()")
        if self._is_known_hierarchical_column(bare):
            # Token looks like "((nested.col": peel off the layer prefix
            prefix, field = token.split(".")
            active_layer = prefix.strip("()")
            # Re-emit each opening parenthesis that was glued to the prefix
            layer_tokens[active_layer].extend("(" * prefix.count("("))
            layer_tokens[active_layer].append(field)
        elif bare in self.columns:
            # A top-level column moves the focus back to the base layer
            active_layer = "base"
            layer_tokens[active_layer].append(token)
        else:
            # Anything else stays with the layer currently in focus
            layer_tokens[active_layer].append(token)

    return {name: " ".join(parts) for name, parts in layer_tokens.items() if parts}

def query(self, expr) -> Self:  # type: ignore[name-defined] # noqa: F821
    """
    Query the columns of a NestedFrame with a boolean expression. Specified
    queries can target nested columns in addition to the typical column set

    Parameters
    ----------
    expr : str
        The query string to evaluate.

        Access nested columns using `nested_df.nested_col` (where
        `nested_df` refers to a particular nested dataframe and
        `nested_col` is a column of that nested dataframe).

        You can refer to variables
        in the environment by prefixing them with an '@' character like
        ``@a + b``.

        You can refer to column names that are not valid Python variable names
        by surrounding them in backticks. Thus, column names containing spaces
        or punctuations (besides underscores) or starting with digits must be
        surrounded by backticks. (For example, a column named "Area (cm^2)" would
        be referenced as ```Area (cm^2)```). Column names which are Python keywords
        (like "list", "for", "import", etc) cannot be used.

        For example, if one of your columns is called ``a a`` and you want
        to sum it with ``b``, your query should be ```a a` + b``.

    Returns
    -------
    DataFrame
        DataFrame resulting from the provided query expression.

    Raises
    ------
    ValueError
        If the expression does not resolve to exactly one layer (base or
        a single nested structure).

    Notes
    -----
    Queries that target a particular nested structure return a dataframe
    with rows of that particular nested structure filtered. For example,
    querying the NestedFrame "df" with nested structure "mynested" as
    below will return all rows of df, but with mynested filtered by the
    condition:

    >>> df.query("mynested.a > 2")
    """

    # Rebuild queries for each specified nested/base layer
    exprs_to_use = self._split_query(expr)

    # For now (simplicity), limit query to only operating on one layer
    if len(exprs_to_use) != 1:
        raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")

    # Exactly one entry is guaranteed by the check above
    ((layer, layer_expr),) = exprs_to_use.items()

    if layer == "base":
        # pandas resolves ``@name`` references by walking ``level`` stack
        # frames up from DataFrame.query; this wrapper adds one frame, so
        # bump the level to land in the caller's scope instead of ours.
        return super().query(layer_expr, inplace=False, level=1)

    # Only the nested path mutates a column, so only it needs a copy
    result = self.copy()
    # TODO: does not work with queries that empty the dataframe
    result[layer] = result[layer].nest.query_flat(layer_expr)
    return result
35 changes: 35 additions & 0 deletions src/nested_pandas/nestedframe/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
def _ensure_spacing(expr) -> str:
"""Ensure that an eval string has spacing"""
single_val_operators = {"+", "-", "*", "/", "%", ">", "<", "|", "&", "~", "="} # omit "(" and ")"
check_for_doubles = {"=", "/", "*", ">", "<"}
double_val_operators = {"==", "//", "**", ">=", "<="}
expr_list = expr

i = 0
spaced_expr = ""
while i < len(expr_list):
if expr_list[i] not in single_val_operators:
spaced_expr += expr_list[i]
else:
if expr_list[i] in check_for_doubles:
if "".join(expr_list[i : i + 2]) in double_val_operators:
if spaced_expr[-1] != " ":
spaced_expr += " "
spaced_expr += expr_list[i : i + 2]
if expr_list[i + 2] != " ":
spaced_expr += " "
i += 1 # skip ahead an extra time
else:
if spaced_expr[-1] != " ":
spaced_expr += " "
spaced_expr += expr_list[i]
if expr_list[i + 1] != " ":
spaced_expr += " "
else:
if spaced_expr[-1] != " ":
spaced_expr += " "
spaced_expr += expr_list[i]
if expr_list[i + 1] != " ":
spaced_expr += " "
i += 1
return spaced_expr
27 changes: 27 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
import pytest
from nested_pandas import NestedFrame


Expand Down Expand Up @@ -74,3 +75,29 @@ def test_add_nested():

assert "nested" in base.columns
assert base.nested.nest.to_flat().equals(nested)


def test_query():
    """Exercise NestedFrame.query on base, mixed and nested expressions"""

    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    flat = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    frame = frame.add_nested(flat, "nested")

    # A query on a base column filters rows of the frame itself
    base_hits = frame.query("a > 2")
    assert len(base_hits) == 1

    # Mixing a base column and a nested column in one query is rejected
    with pytest.raises(ValueError):
        frame.query("a > 2 & nested.c > 1")

    # A nested query filters the rows of the nested structure
    single_condition = frame.query("nested.c > 1")
    assert len(single_condition.nested.nest.to_flat()) == 5

    # Parenthesised, partly unspaced conditions on the same nested layer
    double_condition = frame.query("(nested.c > 1) and (nested.d>2)")
    assert len(double_condition.nested.nest.to_flat()) == 4
17 changes: 17 additions & 0 deletions tests/nested_pandas/nestedframe/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pytest
from nested_pandas.nestedframe import utils


@pytest.mark.parametrize(
    "in_out",
    [
        ("a>3", "a > 3"),
        ("test.a>5&b==2", "test.a > 5 & b == 2"),
        ("b > 3", "b > 3"),
        ("(a.b > 3)&(a.c == 'f')", "(a.b > 3) & (a.c == 'f')"),
    ],
)
def test_ensure_spacing(in_out):
    """Each (input, expected) pair must come out of _ensure_spacing correctly spaced"""
    raw_query, expected = in_out
    spaced = utils._ensure_spacing(raw_query)
    assert spaced == expected