diff --git a/edvart/data_types.py b/edvart/data_types.py
index 4647b08..26a18fa 100644
--- a/edvart/data_types.py
+++ b/edvart/data_types.py
@@ -5,6 +5,13 @@
 import numpy as np
 import pandas as pd
 
+try:
+    import pyarrow  # pylint: disable=unused-import
+except ImportError:
+    PYARROW_PANDAS_BACKEND_AVAILABLE = False
+else:
+    PYARROW_PANDAS_BACKEND_AVAILABLE = pd.__version__ >= "2.0"
+
 
 class DataType(IntEnum):
     """Class describe possible data types."""
@@ -85,13 +92,7 @@ def is_numeric(series: pd.Series) -> bool:
     """
     if is_missing(series):
        return False
-    # When an unkown dtype is encountered, `np.issubdtype(series.dtype, np.number)`
-    # raises a TypeError. This happens for example if `series` is `pd.Categorical`
-    # If the dtype is unknown, we treat it as non-numeric, therefore return False.
-    try:
-        return np.issubdtype(series.dtype, np.number)
-    except TypeError:
-        return False
+    return pd.api.types.is_numeric_dtype(series)


 def is_missing(series: pd.Series) -> bool:
@@ -179,7 +180,7 @@ def is_date(series: pd.Series) -> bool:
     if contains_numerics:
         return False
     try:
-        converted_series = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)
+        converted_series = pd.to_datetime(series, errors="coerce", format="mixed")
     except ValueError:
         return False
     return converted_series.notna().all()
diff --git a/edvart/utils.py b/edvart/utils.py
index 2ff67ab..183e050 100755
--- a/edvart/utils.py
+++ b/edvart/utils.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import statsmodels.api as sm
+from scipy import stats

 from edvart.data_types import is_numeric

@@ -76,9 +77,7 @@ def reindex_to_datetime(
         Reindexed df.
     """
     df = df.copy()
-    new_index = pd.to_datetime(
-        df[datetime_column], unit=unit, origin=origin, infer_datetime_format=True
-    )
+    new_index = pd.to_datetime(df[datetime_column], unit=unit, origin=origin, format="mixed")
     if keep_index is not None:
         df[keep_index] = df.index
     df = df.drop(datetime_column, axis="columns")
@@ -213,6 +212,8 @@ def median_absolute_deviation(series: pd.Series) -> float:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return median((series - series.mean()).abs())


@@ -245,6 +246,8 @@ def minimum(series: pd.Series) -> float:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return series.min()


@@ -261,6 +264,8 @@ def maximum(series: pd.Series) -> float:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return series.max()


@@ -277,6 +282,8 @@ def quartile1(series: pd.Series) -> float:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return series.quantile(0.25)


@@ -293,6 +300,8 @@ def quartile3(series: pd.Series) -> float:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return series.quantile(0.75)


@@ -309,6 +318,8 @@ def mean(series: pd.Series) -> float:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return series.mean()


@@ -395,6 +406,8 @@ def std(series: pd.Series) -> float:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return series.std()


@@ -411,6 +424,8 @@ def mad(series: pd.Series) -> Any:
     -------
     float
     """
+    if series.isnull().all():
+        return float("nan")
     return (series - series.mean()).abs().mean()


@@ -427,7 +442,9 @@ def kurtosis(series: pd.Series) -> Any:
     -------
     float
     """
-    return series.kurtosis()
+    if series.isnull().all():
+        return float("nan")
+    return stats.kurtosis(series)


 def skewness(series: pd.Series) -> Any:
@@ -443,7 +460,9 @@ def skewness(series: pd.Series) -> Any:
     -------
     float
     """
-    return series.skew()
+    if series.isnull().all():
+        return float("nan")
+    return stats.skew(series)


 def sum_(series: pd.Series) -> float:
diff --git a/pyproject.toml b/pyproject.toml
index e1ebb24..132a2f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ exclude = ["tests"]
 [tool.poetry.dependencies]
 python = ">=3.8, <3.12"
 ipykernel = "*"
-pandas = "^1.5"
+pandas = ">=1.5, <2.1"
 numpy = "*"
 matplotlib = "*"
 seaborn = "^0.12"
@@ -28,10 +28,12 @@ umap-learn = { version = "^0.5.4", optional = true}
 # which also installs an older version of llmvlite, which is incompatible
 # with newer version of LLVM binaries.
 numba = { version = "^0.57", optional = true }
+pyarrow = { version = "^13.0.0", optional = true }

 [tool.poetry.extras]
 umap = ["umap-learn", "numba"]
-all = ["umap-learn", "numba"]
+arrow = ["pyarrow"]
+all = ["umap-learn", "numba", "arrow"]

 [tool.poetry.dev-dependencies]
 pytest-cov = "^2.8"
diff --git a/tests/pyarrow_utils.py b/tests/pyarrow_utils.py
new file mode 100644
index 0000000..6bbc8fe
--- /dev/null
+++ b/tests/pyarrow_utils.py
@@ -0,0 +1,8 @@
+import pytest
+
+from edvart.data_types import PYARROW_PANDAS_BACKEND_AVAILABLE
+
+if PYARROW_PANDAS_BACKEND_AVAILABLE:
+    pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False, True])
+else:
+    pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False])
diff --git a/tests/test_bivariate_analysis.py b/tests/test_bivariate_analysis.py
index bb71a5b..616b1d5 100644
--- a/tests/test_bivariate_analysis.py
+++ b/tests/test_bivariate_analysis.py
@@ -9,9 +9,13 @@
 from edvart.report_sections.code_string_formatting import get_code
 from edvart.report_sections.section_base import Verbosity

+from .pyarrow_utils import pyarrow_parameterize

-def get_test_df() -> pd.DataFrame:
+
+def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame:
     test_df = pd.DataFrame(data=[[1.1, "a"], [2.2, "b"], [3.3, "c"]], columns=["A", "B"])
+    if pyarrow_dtypes:
+        test_df = test_df.convert_dtypes(dtype_backend="pyarrow")
     return test_df


@@ -407,9 +411,10 @@ def test_imports_verbosity_low_different_subsection_verbosities():
     assert set(exported_imports) == set(expected_imports)


-def test_show():
+@pyarrow_parameterize
+def test_show(pyarrow_dtypes: bool):
     bivariate_section = BivariateAnalysis()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", UserWarning)
         with redirect_stdout(None):
-            bivariate_section.show(get_test_df())
+            bivariate_section.show(get_test_df(pyarrow_dtypes=pyarrow_dtypes))
diff --git a/tests/test_data_type_inference.py b/tests/test_data_type_inference.py
index b418681..d1e3844 100644
--- a/tests/test_data_type_inference.py
+++ b/tests/test_data_type_inference.py
@@ -10,7 +10,9 @@ def test_inference():
         == data_types.DataType.NUMERIC
     ), "Should be numeric type"
     assert (
-        data_types.infer_data_type(pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02"]))
+        data_types.infer_data_type(
+            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
+        )
         == data_types.DataType.DATE
     ), "Should be date type"
     assert (
@@ -95,13 +97,13 @@ def test_boolean_series():

 def test_date_series():
     assert data_types.is_date(
-        pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02"])
+        pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
     ), "Should be type date"
     assert data_types.is_date(
         pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"])
     ), "Should be type date"
     assert not data_types.is_date(
-        pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", "nan"])
+        pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"])
     ), "Should not be type date"
     assert not data_types.is_date(
         pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3])
diff --git a/tests/test_group_analysis.py b/tests/test_group_analysis.py
index b0710e3..257dace 100644
--- a/tests/test_group_analysis.py
+++ b/tests/test_group_analysis.py
@@ -21,18 +21,23 @@
 )
 from edvart.report_sections.section_base import Verbosity

+from .pyarrow_utils import pyarrow_parameterize
+
 # Workaround to prevent multiple browser tabs opening with figures
 plotly.io.renderers.default = "json"


-def get_test_df():
-    return pd.DataFrame(
+def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame:
+    test_df = pd.DataFrame(
         data=[
             ["P" if np.random.uniform() < 0.4 else "N", 1.5 * i, "X" if i % 2 == 0 else "Y"]
             for i in range(60)
         ],
         columns=["A", "B", "C"],
     )
+    if pyarrow_dtypes:
+        test_df = test_df.convert_dtypes(dtype_backend="pyarrow")
+    return test_df


 def test_default_config_verbosity():
@@ -47,15 +52,19 @@
         GroupAnalysis(groupby=[], verbosity=-1)


-def test_groupby_nonexistent_col():
+@pyarrow_parameterize
+def test_groupby_nonexistent_col(pyarrow_dtypes: bool):
     with pytest.raises(ValueError):
-        show_group_analysis(df=get_test_df(), groupby=["non-existent"])
+        show_group_analysis(df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"])
     with pytest.raises(ValueError):
-        group_missing_values(df=get_test_df(), groupby=["non-existent"])
+        group_missing_values(
+            df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"]
+        )


-def test_static_methods():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_static_methods(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     with redirect_stdout(None):
         show_group_analysis(df=df, groupby="C")
         show_group_analysis(df=df, groupby=["C"], columns=["A"])
@@ -80,8 +89,9 @@
         overlaid_histograms(df, groupby=["B"], column="B")


-def test_code_export_verbosity_low():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_code_export_verbosity_low(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     group_section = GroupAnalysis(groupby="B", verbosity=Verbosity.LOW)

     # Export code
@@ -96,8 +106,9 @@ def test_code_export_verbosity_low():
     assert exported_code[0] == expected_code[0], "Exported code mismatch"


-def test_code_export_verbosity_medium():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_code_export_verbosity_medium(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     group_section = GroupAnalysis(groupby="A", verbosity=Verbosity.MEDIUM)

     # Export code
@@ -122,8 +133,9 @@ def test_code_export_verbosity_medium():
         assert expected_line == exported_line, "Exported code mismatch"


-def test_code_export_verbosity_high():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_code_export_verbosity_high(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     group_section = GroupAnalysis(groupby="A", verbosity=Verbosity.HIGH)

     # Export code
@@ -176,8 +188,9 @@ def test_code_export_verbosity_high():
         assert expected_line == exported_line, "Exported code mismatch"


-def test_columns_parameter():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_columns_parameter(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     ga = GroupAnalysis(groupby="A", columns=["B"])
     assert ga.groupby == ["A"]
     assert ga.columns == ["B"]
@@ -192,14 +205,14 @@


 def test_column_list_not_modified():
-    df = get_test_df()
     columns = ["C"]
     GroupAnalysis(groupby=["A"], columns=columns)
     assert columns == ["C"], "Column list modified"


-def test_show():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_show(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     group_section = GroupAnalysis(groupby="A")
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", UserWarning)
diff --git a/tests/test_multivariate_analysis.py b/tests/test_multivariate_analysis.py
index 20ba3f2..2799330 100644
--- a/tests/test_multivariate_analysis.py
+++ b/tests/test_multivariate_analysis.py
@@ -15,8 +15,10 @@
 from edvart.report_sections.section_base import Verbosity
 from edvart.utils import select_numeric_columns

+from .pyarrow_utils import pyarrow_parameterize

-def get_test_df() -> pd.DataFrame:
+
+def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame:
     test_df = pd.DataFrame(
         data=[
             [1.1, "a", 3.7, 3.9],
@@ -27,6 +29,8 @@ def get_test_df() -> pd.DataFrame:
         ],
         columns=["A", "B", "C", "D"],
     )
+    if pyarrow_dtypes:
+        test_df = test_df.convert_dtypes(dtype_backend="pyarrow")
     return test_df


@@ -86,7 +90,6 @@ def test_verbosity_propagation():


 def test_negative_verbosities():
-    test_df = get_test_df()
     with pytest.raises(ValueError):
         MultivariateAnalysis(verbosity=-2)
     with pytest.raises(ValueError):
@@ -125,8 +128,9 @@ def test_section_adding():
     ), "Subsection should be UMAP"


-def test_code_export_verbosity_low():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_code_export_verbosity_low(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     multivariate_section = multivariate_analysis.MultivariateAnalysis(verbosity=Verbosity.LOW)
     # Export code
     exported_cells = []
@@ -140,7 +144,8 @@ def test_code_export_verbosity_low():
     assert exported_code[0] == expected_code[0], "Exported code mismatch"


-def test_code_export_verbosity_low_with_subsections():
+@pyarrow_parameterize
+def test_code_export_verbosity_low_with_subsections(pyarrow_dtypes: bool):
     subsec = multivariate_analysis.MultivariateAnalysis.MultivariateAnalysisSubsection
     subsections = [subsec.ParallelCategories, subsec.PCA, subsec.ParallelCoordinates, subsec.PCA]
     if UMAP_AVAILABLE:
@@ -178,7 +183,8 @@ def test_code_export_verbosity_low_with_subsections():
     assert exported_code[0] == expected_code[0], "Exported code mismatch"


-def test_code_export_verbosity_medium_all_cols_valid():
+@pyarrow_parameterize
+def test_code_export_verbosity_medium_all_cols_valid(pyarrow_dtypes: bool):
     all_numeric_df = pd.DataFrame(
         data=[[1.1, 1, -2], [2.2, 2, -5.3], [3.3, 3, 4]], columns=["col1", "col2", "col3"]
     )
@@ -204,7 +210,8 @@ def test_code_export_verbosity_medium_all_cols_valid():
         assert expected_line == exported_line, "Exported code mismatch"


-def test_generated_code_verbosity_1():
+@pyarrow_parameterize
+def test_generated_code_verbosity_1(pyarrow_dtypes: bool):
     multivariate_section = multivariate_analysis.MultivariateAnalysis(verbosity=Verbosity.MEDIUM)
     df = get_test_df()

@@ -240,8 +247,9 @@ def test_generated_code_verbosity_1():
         assert expected_line == exported_line, "Exported code mismatch"


-def test_generated_code_verbosity_2():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_generated_code_verbosity_2(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     multivariate_section = multivariate_analysis.MultivariateAnalysis(verbosity=Verbosity.HIGH)

     multivariate_cells = []
@@ -301,10 +309,13 @@ def test_generated_code_verbosity_2():
         assert expected_line == exported_line, "Exported code mismatch"


-def test_verbosity_medium_non_categorical_col():
+@pyarrow_parameterize
+def test_verbosity_medium_non_categorical_col(pyarrow_dtypes: bool):
     random_array = np.random.randint(low=1, high=40, size=(100, 3))
     random_df = pd.DataFrame(data=random_array, columns=["integral", "floating", "cat"])
     random_df = random_df.astype({"integral": int, "floating": float, "cat": "category"})
+    if pyarrow_dtypes:
+        random_df = random_df.convert_dtypes(dtype_backend="pyarrow")
     subsec = multivariate_analysis.MultivariateAnalysis.MultivariateAnalysisSubsection
     multivariate_section = multivariate_analysis.MultivariateAnalysis(
         subsections=[subsec.ParallelCategories], verbosity=Verbosity.MEDIUM
@@ -321,7 +332,8 @@ def test_verbosity_medium_non_categorical_col():
         assert expected_line == exported_line, "Exported code mismatch"


-def test_verbosity_low_different_subsection_verbosities():
+@pyarrow_parameterize
+def test_verbosity_low_different_subsection_verbosities(pyarrow_dtypes: bool):
     subsections = [
         MultivariateAnalysis.MultivariateAnalysisSubsection.PCA,
         MultivariateAnalysis.MultivariateAnalysisSubsection.PCA,
@@ -330,7 +342,7 @@ def test_verbosity_low_different_subsection_verbosities():
     ]
     if UMAP_AVAILABLE:
         subsections.insert(2, MultivariateAnalysis.MultivariateAnalysisSubsection.UMAP)
-    df = get_test_df()
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     multivariate_section = MultivariateAnalysis(
         verbosity=Verbosity.LOW,
         subsections=subsections,
@@ -437,8 +449,9 @@ def test_imports_verbosity_low_different_subsection_verbosities():
     assert set(exported_imports) == set(expected_imports)


-def test_show():
-    df = get_test_df()
+@pyarrow_parameterize
+def test_show(pyarrow_dtypes: bool):
+    df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     multivariate_section = MultivariateAnalysis()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", UserWarning)
diff --git a/tests/test_univariate_analysis_section.py b/tests/test_univariate_analysis_section.py
index 3dd0b0f..76e0ce8 100644
--- a/tests/test_univariate_analysis_section.py
+++ b/tests/test_univariate_analysis_section.py
@@ -9,6 +9,16 @@
 from edvart.report_sections.code_string_formatting import code_dedent, get_code
 from edvart.report_sections.section_base import Verbosity

+from .pyarrow_utils import pyarrow_parameterize
+
+
+def get_test_df(pyarrow_dtypes: bool) -> pd.DataFrame:
+    test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"])
+    if pyarrow_dtypes:
+        test_df = test_df.convert_dtypes(dtype_backend="pyarrow")
+
+    return test_df
+

 def test_invalid_verbosity():
     with pytest.raises(ValueError):
@@ -21,8 +31,9 @@ def test_invalid_verbosity():
         univariate_analysis.UnivariateAnalysis(verbosity="1")


-def test_code_export_verbosity_low():
-    test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"])
+@pyarrow_parameterize
+def test_code_export_verbosity_low(pyarrow_dtypes: bool):
+    test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     # Construct univariate analysis section
     univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.LOW)
     # Export code
@@ -36,8 +47,9 @@ def test_code_export_verbosity_low():
     assert exported_code[0] == expected_code[0], "Exported code mismatch"


-def test_code_export_verbosity_medium():
-    test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"])
+@pyarrow_parameterize
+def test_code_export_verbosity_medium(pyarrow_dtypes: bool):
+    test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     # Construct univariate analysis section
     univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.MEDIUM)
     # Export code
@@ -55,8 +67,9 @@ def test_code_export_verbosity_medium():
         assert exported_code[i] == expected_code[i], "Exported code mismatch"


-def test_code_export_verbosity_high():
-    test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"])
+@pyarrow_parameterize
+def test_code_export_verbosity_high(pyarrow_dtypes: bool):
+    test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     # Construct univariate analysis section
     univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.HIGH)
     # Export code
@@ -105,8 +118,9 @@ def test_code_export_verbosity_high():
         assert exported_code[i] == expected_code[i], "Exported code mismatch"


-def test_show():
-    test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"])
+@pyarrow_parameterize
+def test_show(pyarrow_dtypes: bool):
+    test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes)
     univariate_section = univariate_analysis.UnivariateAnalysis()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", UserWarning)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7eb4856..aad6be5 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,9 +7,14 @@
 from edvart import utils

+from .pyarrow_utils import pyarrow_parameterize

-def test_full_na_series():
+
+@pyarrow_parameterize
+def test_full_na_series(pyarrow_dtypes: bool):
     series = pd.Series([None, np.nan, None])
+    if pyarrow_dtypes:
+        series = series.convert_dtypes(dtype_backend="pyarrow")
     for func in (
         utils.quartile1,
         utils.median,