Skip to content

Commit

Permalink
feat: add support for pandas 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
mbelak-dtml committed Oct 4, 2023
1 parent acade1c commit cd7168b
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 62 deletions.
17 changes: 9 additions & 8 deletions edvart/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
import numpy as np
import pandas as pd

# Flag telling whether pandas can be used with the pyarrow dtype backend:
# pyarrow must be importable and pandas must be at least version 2.0.
try:
    import pyarrow  # pylint: disable=unused-import
except ImportError:
    PYARROW_PANDAS_BACKEND_AVAILABLE = False
else:
    # Compare the major version numerically. Comparing the raw version strings
    # is lexicographic, so e.g. "10.0" >= "2.0" would evaluate to False.
    _pandas_major = pd.__version__.partition(".")[0]
    PYARROW_PANDAS_BACKEND_AVAILABLE = _pandas_major.isdigit() and int(_pandas_major) >= 2


class DataType(IntEnum):
"""Class describe possible data types."""
Expand Down Expand Up @@ -85,13 +92,7 @@ def is_numeric(series: pd.Series) -> bool:
"""
if is_missing(series):
return False
# When an unkown dtype is encountered, `np.issubdtype(series.dtype, np.number)`
# raises a TypeError. This happens for example if `series` is `pd.Categorical`
# If the dtype is unknown, we treat it as non-numeric, therefore return False.
try:
return np.issubdtype(series.dtype, np.number)
except TypeError:
return False
return pd.api.types.is_numeric_dtype(series)


def is_missing(series: pd.Series) -> bool:
Expand Down Expand Up @@ -179,7 +180,7 @@ def is_date(series: pd.Series) -> bool:
if contains_numerics:
return False
try:
converted_series = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)
converted_series = pd.to_datetime(series, errors="coerce", format="mixed")
except ValueError:
return False
return converted_series.notna().all()
29 changes: 24 additions & 5 deletions edvart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import pandas as pd
import statsmodels.api as sm
from scipy import stats

from edvart.data_types import is_numeric

Expand Down Expand Up @@ -76,9 +77,7 @@ def reindex_to_datetime(
Reindexed df.
"""
df = df.copy()
new_index = pd.to_datetime(
df[datetime_column], unit=unit, origin=origin, infer_datetime_format=True
)
new_index = pd.to_datetime(df[datetime_column], unit=unit, origin=origin, format="mixed")
if keep_index is not None:
df[keep_index] = df.index
df = df.drop(datetime_column, axis="columns")
Expand Down Expand Up @@ -213,6 +212,8 @@ def median_absolute_deviation(series: pd.Series) -> float:
-------
float
"""
if series.isnull().all():
return float("nan")
return median((series - series.mean()).abs())


Expand Down Expand Up @@ -245,6 +246,8 @@ def minimum(series: pd.Series) -> float:
-------
float
"""
if series.isnull().all():
return float("nan")
return series.min()


Expand All @@ -261,6 +264,8 @@ def maximum(series: pd.Series) -> float:
-------
float
"""
if series.isnull().all():
return float("nan")
return series.max()


Expand All @@ -277,6 +282,8 @@ def quartile1(series: pd.Series) -> float:
-------
float
"""
if series.isnull().all():
return float("nan")
return series.quantile(0.25)


Expand All @@ -293,6 +300,8 @@ def quartile3(series: pd.Series) -> float:
-------
float
"""
if series.isnull().all():
return float("nan")
return series.quantile(0.75)


Expand All @@ -309,6 +318,8 @@ def mean(series: pd.Series) -> float:
-------
float
"""
if series.isnull().all():
return float("nan")
return series.mean()


Expand Down Expand Up @@ -395,6 +406,8 @@ def std(series: pd.Series) -> float:
-------
float
"""
if series.isnull().all():
return float("nan")
return series.std()


Expand All @@ -411,6 +424,8 @@ def mad(series: pd.Series) -> Any:
-------
float
"""
if series.isnull().all():
return float("nan")
return (series - series.mean()).abs().mean()


Expand All @@ -427,7 +442,9 @@ def kurtosis(series: pd.Series) -> Any:
-------
float
"""
return series.kurtosis()
if series.isnull().all():
return float("nan")
return stats.kurtosis(series)


def skewness(series: pd.Series) -> Any:
Expand All @@ -443,7 +460,9 @@ def skewness(series: pd.Series) -> Any:
-------
float
"""
return series.skew()
if series.isnull().all():
return float("nan")
return stats.skew(series)


def sum_(series: pd.Series) -> float:
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ exclude = ["tests"]
[tool.poetry.dependencies]
python = ">=3.8, <3.12"
ipykernel = "*"
pandas = "^1.5"
pandas = ">=1.5, <2.1"
numpy = "*"
matplotlib = "*"
seaborn = "^0.12"
Expand All @@ -28,10 +28,12 @@ umap-learn = { version = "^0.5.4", optional = true}
# which also installs an older version of llvmlite, which is incompatible
# with newer version of LLVM binaries.
numba = { version = "^0.57", optional = true }
pyarrow = { version = "^13.0.0", optional = true }

[tool.poetry.extras]
umap = ["umap-learn", "numba"]
all = ["umap-learn", "numba"]
arrow = ["pyarrow"]
# Extras must list dependency names, not other extras, so reference "pyarrow" directly.
all = ["umap-learn", "numba", "pyarrow"]

[tool.poetry.dev-dependencies]
pytest-cov = "^2.8"
Expand Down
8 changes: 8 additions & 0 deletions tests/pyarrow_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pytest

from edvart.data_types import PYARROW_PANDAS_BACKEND_AVAILABLE

# Parametrize tests over the `pyarrow_dtypes` flag; the pyarrow-backed variant
# is only included when the pyarrow pandas backend is available.
_pyarrow_dtypes_values = [False, True] if PYARROW_PANDAS_BACKEND_AVAILABLE else [False]
pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", _pyarrow_dtypes_values)
11 changes: 8 additions & 3 deletions tests/test_bivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@
from edvart.report_sections.code_string_formatting import get_code
from edvart.report_sections.section_base import Verbosity

from .pyarrow_utils import pyarrow_parameterize

def get_test_df() -> pd.DataFrame:

def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame:
    """Build the small two-column frame used by the bivariate analysis tests.

    Parameters
    ----------
    pyarrow_dtypes : bool
        If True, the frame is converted to pyarrow-backed dtypes before it is returned.

    Returns
    -------
    pd.DataFrame
        Frame with a float column "A" and a string column "B".
    """
    rows = [[1.1, "a"], [2.2, "b"], [3.3, "c"]]
    frame = pd.DataFrame(data=rows, columns=["A", "B"])
    if not pyarrow_dtypes:
        return frame
    return frame.convert_dtypes(dtype_backend="pyarrow")

Expand Down Expand Up @@ -407,9 +411,10 @@ def test_imports_verbosity_low_different_subsection_verbosities():
assert set(exported_imports) == set(expected_imports)


def test_show():
@pyarrow_parameterize
def test_show(pyarrow_dtypes: bool):
bivariate_section = BivariateAnalysis()
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
with redirect_stdout(None):
bivariate_section.show(get_test_df())
bivariate_section.show(get_test_df(pyarrow_dtypes=pyarrow_dtypes))
8 changes: 5 additions & 3 deletions tests/test_data_type_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ def test_inference():
== data_types.DataType.NUMERIC
), "Should be numeric type"
assert (
data_types.infer_data_type(pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02"]))
data_types.infer_data_type(
pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
)
== data_types.DataType.DATE
), "Should be date type"
assert (
Expand Down Expand Up @@ -95,13 +97,13 @@ def test_boolean_series():

def test_date_series():
assert data_types.is_date(
pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02"])
pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
), "Should be type date"
assert data_types.is_date(
pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"])
), "Should be type date"
assert not data_types.is_date(
pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", "nan"])
pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"])
), "Should not be type date"
assert not data_types.is_date(
pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3])
Expand Down
49 changes: 31 additions & 18 deletions tests/test_group_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,23 @@
)
from edvart.report_sections.section_base import Verbosity

from .pyarrow_utils import pyarrow_parameterize

# Workaround to prevent multiple browser tabs opening with figures
plotly.io.renderers.default = "json"


def get_test_df():
return pd.DataFrame(
def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame:
    """Build the 60-row frame used by the group analysis tests.

    Parameters
    ----------
    pyarrow_dtypes : bool
        If True, the frame is converted to pyarrow-backed dtypes before it is returned.

    Returns
    -------
    pd.DataFrame
        Frame with a randomly drawn "P"/"N" column "A", a numeric column "B"
        and an alternating "X"/"Y" column "C".
    """
    rows = []
    for i in range(60):
        # One random draw per row, in row order, decides the label in column "A".
        label = "P" if np.random.uniform() < 0.4 else "N"
        parity = "X" if i % 2 == 0 else "Y"
        rows.append([label, 1.5 * i, parity])
    frame = pd.DataFrame(data=rows, columns=["A", "B", "C"])
    if not pyarrow_dtypes:
        return frame
    return frame.convert_dtypes(dtype_backend="pyarrow")


def test_default_config_verbosity():
Expand All @@ -47,15 +52,19 @@ def test_invalid_verbosities():
GroupAnalysis(groupby=[], verbosity=-1)


def test_groupby_nonexistent_col():
@pyarrow_parameterize
def test_groupby_nonexistent_col(pyarrow_dtypes: bool):
with pytest.raises(ValueError):
show_group_analysis(df=get_test_df(), groupby=["non-existent"])
show_group_analysis(df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"])
with pytest.raises(ValueError):
group_missing_values(df=get_test_df(), groupby=["non-existent"])
group_missing_values(
df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"]
)


def test_static_methods():
df = get_test_df()
@pyarrow_parameterize
def test_static_methods(pyarrow_dtypes: bool):
df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
with redirect_stdout(None):
show_group_analysis(df=df, groupby="C")
show_group_analysis(df=df, groupby=["C"], columns=["A"])
Expand All @@ -80,8 +89,9 @@ def test_static_methods():
overlaid_histograms(df, groupby=["B"], column="B")


def test_code_export_verbosity_low():
df = get_test_df()
@pyarrow_parameterize
def test_code_export_verbosity_low(pyarrow_dtypes: bool):
df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
group_section = GroupAnalysis(groupby="B", verbosity=Verbosity.LOW)

# Export code
Expand All @@ -96,8 +106,9 @@ def test_code_export_verbosity_low():
assert exported_code[0] == expected_code[0], "Exported code mismatch"


def test_code_export_verbosity_medium():
df = get_test_df()
@pyarrow_parameterize
def test_code_export_verbosity_medium(pyarrow_dtypes: bool):
df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
group_section = GroupAnalysis(groupby="A", verbosity=Verbosity.MEDIUM)

# Export code
Expand All @@ -122,8 +133,9 @@ def test_code_export_verbosity_medium():
assert expected_line == exported_line, "Exported code mismatch"


def test_code_export_verbosity_high():
df = get_test_df()
@pyarrow_parameterize
def test_code_export_verbosity_high(pyarrow_dtypes: bool):
df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
group_section = GroupAnalysis(groupby="A", verbosity=Verbosity.HIGH)

# Export code
Expand Down Expand Up @@ -176,8 +188,9 @@ def test_code_export_verbosity_high():
assert expected_line == exported_line, "Exported code mismatch"


def test_columns_parameter():
df = get_test_df()
@pyarrow_parameterize
def test_columns_parameter(pyarrow_dtypes: bool):
df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
ga = GroupAnalysis(groupby="A", columns=["B"])
assert ga.groupby == ["A"]
assert ga.columns == ["B"]
Expand All @@ -192,14 +205,14 @@ def test_columns_parameter():


def test_column_list_not_modified():
df = get_test_df()
columns = ["C"]
GroupAnalysis(groupby=["A"], columns=columns)
assert columns == ["C"], "Column list modified"


def test_show():
df = get_test_df()
@pyarrow_parameterize
def test_show(pyarrow_dtypes: bool):
df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
group_section = GroupAnalysis(groupby="A")
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
Expand Down
Loading

0 comments on commit cd7168b

Please sign in to comment.