From 9032158187f144d29c8d4a294b19b7554d8f3fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= Date: Mon, 11 Mar 2024 14:30:19 +0100 Subject: [PATCH 1/2] test: use fixtures (#207) --- tests/pyarrow_utils.py | 6 +- tests/test_bivariate_analysis.py | 32 ++++----- tests/test_group_analysis.py | 104 ++++++++++++---------------- tests/test_multivariate_analysis.py | 52 ++++++-------- tests/test_overview.py | 27 ++++---- tests/test_report.py | 30 ++++---- tests/test_timeseries_analysis.py | 33 ++++----- tests/test_univariate_analysis.py | 23 +++--- tests/test_utils.py | 5 +- 9 files changed, 139 insertions(+), 173 deletions(-) diff --git a/tests/pyarrow_utils.py b/tests/pyarrow_utils.py index 6bbc8fe..c31372c 100644 --- a/tests/pyarrow_utils.py +++ b/tests/pyarrow_utils.py @@ -1,8 +1,6 @@ -import pytest - from edvart.data_types import PYARROW_PANDAS_BACKEND_AVAILABLE if PYARROW_PANDAS_BACKEND_AVAILABLE: - pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False, True]) + pyarrow_params = [True, False] else: - pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False]) + pyarrow_params = [False] diff --git a/tests/test_bivariate_analysis.py b/tests/test_bivariate_analysis.py index 0f52775..31a833f 100644 --- a/tests/test_bivariate_analysis.py +++ b/tests/test_bivariate_analysis.py @@ -11,12 +11,13 @@ from edvart.report_sections.section_base import Verbosity from .execution_utils import check_section_executes -from .pyarrow_utils import pyarrow_parameterize +from .pyarrow_utils import pyarrow_params -def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: +@pytest.fixture(params=pyarrow_params) +def test_df(request) -> pd.DataFrame: test_df = pd.DataFrame(data=[[1.1, "a"], [2.2, "b"], [3.3, "c"]], columns=["A", "B"]) - if pyarrow_dtypes: + if request.param: test_df = test_df.convert_dtypes(dtype_backend="pyarrow") return test_df @@ -125,7 +126,7 @@ def test_section_adding(): ), "Subsection should be ContingencyTable" -def test_code_export_verbosity_low(): +def test_code_export_verbosity_low(test_df: pd.DataFrame): bivariate_section = bivariate_analysis.BivariateAnalysis(verbosity=Verbosity.LOW) # Export code exported_cells = [] @@ -138,10 +139,10 @@ def test_code_export_verbosity_low(): assert len(exported_code) == 1 assert exported_code[0] == expected_code[0], "Exported code mismatch" - check_section_executes(bivariate_section, df=get_test_df()) + check_section_executes(bivariate_section, df=test_df) -def test_code_export_verbosity_low_with_subsections(): +def test_code_export_verbosity_low_with_subsections(test_df: pd.DataFrame): bivariate_section = bivariate_analysis.BivariateAnalysis( subsections=[ BivariateAnalysisSubsection.ContingencyTable, @@ -164,7 +165,7 @@ def test_code_export_verbosity_low_with_subsections(): assert len(exported_code) == 1 assert exported_code[0] == expected_code[0], "Exported code mismatch" - check_section_executes(bivariate_section, df=get_test_df()) + check_section_executes(bivariate_section, df=test_df) def test_generated_code_verbosity_low_columns(): @@ -209,7 +210,7 @@ def test_generated_code_verbosity_low_columns(): check_section_executes(bivariate_section, df=test_df) -def test_generated_code_verbosity_medium(): +def test_generated_code_verbosity_medium(test_df: pd.DataFrame): bivariate_section = bivariate_analysis.BivariateAnalysis( verbosity=Verbosity.MEDIUM, subsections=[ @@ -233,7 +234,7 @@ def test_generated_code_verbosity_medium(): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(bivariate_section, df=get_test_df()) + check_section_executes(bivariate_section, df=test_df) def test_generated_code_verbosity_medium_columns_x_y(): @@ -307,7 +308,7 @@ def test_generated_code_verbosity_medium_columns_pairs(): check_section_executes(bivariate_section, df=test_df) -def test_generated_code_verbosity_high(): +def test_generated_code_verbosity_high(test_df: pd.DataFrame): bivariate_section = bivariate_analysis.BivariateAnalysis( verbosity=Verbosity.HIGH, subsections=[ @@ -345,10 +346,10 @@ def test_generated_code_verbosity_high(): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(bivariate_section, df=get_test_df()) + check_section_executes(bivariate_section, df=test_df) -def test_verbosity_low_different_subsection_verbosities(): +def test_verbosity_low_different_subsection_verbosities(test_df: pd.DataFrame): bivariate_section = BivariateAnalysis( verbosity=Verbosity.LOW, subsections=[ @@ -377,7 +378,7 @@ def test_verbosity_low_different_subsection_verbosities(): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(bivariate_section, df=get_test_df()) + check_section_executes(bivariate_section, df=test_df) def test_imports_verbosity_low(): @@ -449,10 +450,9 @@ def test_imports_verbosity_low_different_subsection_verbosities(): assert set(exported_imports) == set(expected_imports) -@pyarrow_parameterize -def test_show(pyarrow_dtypes: bool): +def test_show(test_df: pd.DataFrame): bivariate_section = BivariateAnalysis() with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) with redirect_stdout(None): - bivariate_section.show(get_test_df(pyarrow_dtypes=pyarrow_dtypes)) + bivariate_section.show(test_df) diff --git a/tests/test_group_analysis.py b/tests/test_group_analysis.py index eeb6c67..ddb98ba 100644 --- a/tests/test_group_analysis.py +++ b/tests/test_group_analysis.py @@ -22,13 +22,14 @@ from edvart.report_sections.section_base import Verbosity from .execution_utils import check_section_executes -from .pyarrow_utils import pyarrow_parameterize +from .pyarrow_utils import pyarrow_params # Workaround to prevent multiple browser tabs opening with figures plotly.io.renderers.default = "json" -def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: +@pytest.fixture(params=pyarrow_params) +def test_df(request) -> pd.DataFrame: test_df = pd.DataFrame( data=[ ["P" if np.random.uniform() < 0.4 else "N", 1.5 * i, "X" if i % 2 == 0 else "Y"] @@ -36,7 +37,7 @@ def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: ], columns=["A", "B", "C"], ) - if pyarrow_dtypes: + if request.param: test_df = test_df.convert_dtypes(dtype_backend="pyarrow") return test_df @@ -53,51 +54,44 @@ def test_invalid_verbosities(): GroupAnalysis(groupby=[], verbosity=-1) -@pyarrow_parameterize -def test_groupby_nonexistent_col(pyarrow_dtypes: bool): +def test_groupby_nonexistent_col(test_df: pd.DataFrame): with pytest.raises(ValueError): - show_group_analysis(df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"]) + show_group_analysis(df=test_df, groupby=["non-existent"]) with pytest.raises(ValueError): - group_missing_values( - df=get_test_df(pyarrow_dtypes=pyarrow_dtypes), groupby=["non-existent"] - ) + group_missing_values(df=test_df, groupby=["non-existent"]) -@pyarrow_parameterize -def test_static_methods(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_static_methods(test_df: pd.DataFrame): with redirect_stdout(None): - show_group_analysis(df=df, groupby="C") - show_group_analysis(df=df, groupby=["C"], columns=["A"]) - show_group_analysis(df=df, groupby=["C"], columns=["A", "B"]) - show_group_analysis(df=df, groupby="C", columns=["A", "B", "C"]) - show_group_analysis(df=df, groupby="C", columns=["C"]) - - group_barplot(df, groupby=["A"], column="B") - group_barplot(df, groupby=["A"], column="A") - group_barplot(df, groupby=["A", "C"], column="B") - group_barplot(df, groupby=["A"], column="C") - group_barplot(df, groupby=["A"], column="C") - - group_missing_values(df, groupby=["C"]) - group_missing_values(df, groupby=["C"], columns=["A", "B"]) - group_missing_values(df, groupby=["C"], columns=["A", "B", "C"]) - group_missing_values(df, groupby=["C"], columns=["C"]) - - overlaid_histograms(df, groupby=["A"], column="B") - overlaid_histograms(df, groupby=["A", "C"], column="B") - overlaid_histograms(df, groupby=["A", "C"], column="B") - overlaid_histograms(df, groupby=["B"], column="B") - - -@pyarrow_parameterize -def test_code_export_verbosity_low(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) + show_group_analysis(df=test_df, groupby="C") + show_group_analysis(df=test_df, groupby=["C"], columns=["A"]) + show_group_analysis(df=test_df, groupby=["C"], columns=["A", "B"]) + show_group_analysis(df=test_df, groupby="C", columns=["A", "B", "C"]) + show_group_analysis(df=test_df, groupby="C", columns=["C"]) + + group_barplot(test_df, groupby=["A"], column="B") + group_barplot(test_df, groupby=["A"], column="A") + group_barplot(test_df, groupby=["A", "C"], column="B") + group_barplot(test_df, groupby=["A"], column="C") + group_barplot(test_df, groupby=["A"], column="C") + + group_missing_values(test_df, groupby=["C"]) + group_missing_values(test_df, groupby=["C"], columns=["A", "B"]) + group_missing_values(test_df, groupby=["C"], columns=["A", "B", "C"]) + group_missing_values(test_df, groupby=["C"], columns=["C"]) + + overlaid_histograms(test_df, groupby=["A"], column="B") + overlaid_histograms(test_df, groupby=["A", "C"], column="B") + overlaid_histograms(test_df, groupby=["A", "C"], column="B") + overlaid_histograms(test_df, groupby=["B"], column="B") + + +def test_code_export_verbosity_low(test_df: pd.DataFrame): group_section = GroupAnalysis(groupby="B", verbosity=Verbosity.LOW) # Export code exported_cells = [] - group_section.add_cells(exported_cells, df=df) + group_section.add_cells(exported_cells, df=test_df) # Remove markdown and other cells and get code strings exported_code = [cell["source"] for cell in exported_cells if cell["cell_type"] == "code"] # Define expected code @@ -106,17 +100,15 @@ def test_code_export_verbosity_low(pyarrow_dtypes: bool): assert len(exported_code) == 1 assert exported_code[0] == expected_code[0], "Exported code mismatch" - check_section_executes(group_section, df) + check_section_executes(group_section, test_df) -@pyarrow_parameterize -def test_code_export_verbosity_medium(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_code_export_verbosity_medium(test_df: pd.DataFrame): group_section = GroupAnalysis(groupby="A", verbosity=Verbosity.MEDIUM) # Export code exported_cells = [] - group_section.add_cells(exported_cells, df=df) + group_section.add_cells(exported_cells, df=test_df) # Remove markdown and other cells and get code strings exported_code = [cell["source"] for cell in exported_cells if cell["cell_type"] == "code"] # Define expected code @@ -135,17 +127,15 @@ def test_code_export_verbosity_medium(pyarrow_dtypes: bool): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(group_section, df) + check_section_executes(group_section, test_df) -@pyarrow_parameterize -def test_code_export_verbosity_high(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_code_export_verbosity_high(test_df: pd.DataFrame): group_section = GroupAnalysis(groupby="A", verbosity=Verbosity.HIGH) # Export code exported_cells = [] - group_section.add_cells(exported_cells, df=df) + group_section.add_cells(exported_cells, df=test_df) # Remove markdown and other cells and get code strings exported_code = [cell["source"] for cell in exported_cells if cell["cell_type"] == "code"] # Define expected code @@ -192,12 +182,10 @@ def test_code_export_verbosity_high(pyarrow_dtypes: bool): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(group_section, df) + check_section_executes(group_section, test_df) -@pyarrow_parameterize -def test_columns_parameter(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_columns_parameter(test_df: pd.DataFrame): ga = GroupAnalysis(groupby="A", columns=["B"]) assert ga.groupby == ["A"] assert ga.columns == ["B"] @@ -205,8 +193,8 @@ def test_columns_parameter(pyarrow_dtypes: bool): ga = GroupAnalysis(groupby="A") assert ga.groupby == ["A"] assert ga.columns is None - ga.show(df) - ga.add_cells([], df=df) + ga.show(test_df) + ga.add_cells([], df=test_df) assert ga.groupby == ["A"] assert ga.columns is None @@ -217,11 +205,9 @@ def test_column_list_not_modified(): assert columns == ["C"], "Column list modified" -@pyarrow_parameterize -def test_show(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_show(test_df: pd.DataFrame): group_section = GroupAnalysis(groupby="A") with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) with redirect_stdout(None): - group_section.show(df) + group_section.show(test_df) diff --git a/tests/test_multivariate_analysis.py b/tests/test_multivariate_analysis.py index 3a480af..0218894 100644 --- a/tests/test_multivariate_analysis.py +++ b/tests/test_multivariate_analysis.py @@ -21,12 +21,13 @@ from edvart.report_sections import umap from .execution_utils import check_section_executes -from .pyarrow_utils import pyarrow_parameterize +from .pyarrow_utils import pyarrow_params pio.renderers.default = "json" -def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: +@pytest.fixture(params=pyarrow_params) +def test_df(request) -> pd.DataFrame: test_df = pd.DataFrame( data=[ [1.1, "a", 3.7, 3.9], @@ -37,7 +38,7 @@ def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame: ], columns=["A", "B", "C", "D"], ) - if pyarrow_dtypes: + if request.param: test_df = test_df.convert_dtypes(dtype_backend="pyarrow") return test_df @@ -136,9 +137,8 @@ def test_section_adding(): ), "Subsection should be UMAP" -@pyarrow_parameterize -def test_code_export_verbosity_low(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_code_export_verbosity_low(test_df: pd.DataFrame): + df = test_df multivariate_section = MultivariateAnalysis(verbosity=Verbosity.LOW) # Export code exported_cells = [] @@ -154,13 +154,12 @@ def test_code_export_verbosity_low(pyarrow_dtypes: bool): check_section_executes(multivariate_section, df) -@pyarrow_parameterize -def test_code_export_verbosity_low_with_subsections(pyarrow_dtypes: bool): +def test_code_export_verbosity_low_with_subsections(test_df: pd.DataFrame): subsec = MultivariateAnalysisSubsection subsections = [subsec.ParallelCategories, subsec.PCA, subsec.ParallelCoordinates, subsec.PCA] if UMAP_AVAILABLE: subsections.append(subsec.UMAP) - df = get_test_df() + df = test_df multivariate_section = multivariate_analysis.MultivariateAnalysis( subsections=subsections, verbosity=Verbosity.LOW ) @@ -195,8 +194,7 @@ def test_code_export_verbosity_low_with_subsections(pyarrow_dtypes: bool): check_section_executes(multivariate_section, df) -@pyarrow_parameterize -def test_code_export_verbosity_medium_all_cols_valid(pyarrow_dtypes: bool): +def test_code_export_verbosity_medium_all_cols_valid(): all_numeric_df = pd.DataFrame( data=[[1.1, 1, -2], [2.2, 2, -5.3], [3.3, 3, 4]], columns=["col1", "col2", "col3"] ) @@ -226,13 +224,11 @@ def test_code_export_verbosity_medium_all_cols_valid(pyarrow_dtypes: bool): check_section_executes(multivariate_section, all_numeric_df) -@pyarrow_parameterize -def test_generated_code_verbosity_1(pyarrow_dtypes: bool): +def test_generated_code_verbosity_1(test_df: pd.DataFrame): multivariate_section = MultivariateAnalysis(verbosity=Verbosity.MEDIUM) - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) exported_cells = [] - multivariate_section.add_cells(exported_cells, df=df) + multivariate_section.add_cells(exported_cells, df=test_df) exported_code = [cell["source"] for cell in exported_cells if cell["cell_type"] == "code"] if UMAP_AVAILABLE: expected_code = [ @@ -262,16 +258,14 @@ def test_generated_code_verbosity_1(pyarrow_dtypes: bool): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(multivariate_section, df) + check_section_executes(multivariate_section, test_df) -@pyarrow_parameterize -def test_generated_code_verbosity_2(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_generated_code_verbosity_2(test_df: pd.DataFrame): multivariate_section = MultivariateAnalysis(verbosity=Verbosity.HIGH) multivariate_cells = [] - multivariate_section.add_cells(multivariate_cells, df=df) + multivariate_section.add_cells(multivariate_cells, df=test_df) exported_code = [cell["source"] for cell in multivariate_cells if cell["cell_type"] == "code"] expected_code = [ get_code(select_numeric_columns), @@ -330,10 +324,10 @@ def test_generated_code_verbosity_2(pyarrow_dtypes: bool): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(multivariate_section, df) + check_section_executes(multivariate_section, test_df) -@pyarrow_parameterize +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) def test_verbosity_medium_non_categorical_col(pyarrow_dtypes: bool): random_array = np.random.randint(low=1, high=40, size=(100, 3)) random_df = pd.DataFrame(data=random_array, columns=["integral", "floating", "cat"]) @@ -358,8 +352,7 @@ def test_verbosity_medium_non_categorical_col(pyarrow_dtypes: bool): check_section_executes(multivariate_section, random_df) -@pyarrow_parameterize -def test_verbosity_low_different_subsection_verbosities(pyarrow_dtypes: bool): +def test_verbosity_low_different_subsection_verbosities(test_df: pd.DataFrame): subsections = [ MultivariateAnalysisSubsection.PCA, MultivariateAnalysisSubsection.PCA, @@ -368,7 +361,6 @@ def test_verbosity_low_different_subsection_verbosities(pyarrow_dtypes: bool): ] if UMAP_AVAILABLE: subsections.insert(2, MultivariateAnalysisSubsection.UMAP) - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) multivariate_section = MultivariateAnalysis( verbosity=Verbosity.LOW, subsections=subsections, @@ -377,7 +369,7 @@ def test_verbosity_low_different_subsection_verbosities(pyarrow_dtypes: bool): ) multivariate_cells = [] - multivariate_section.add_cells(multivariate_cells, df=df) + multivariate_section.add_cells(multivariate_cells, df=test_df) exported_code = [cell["source"] for cell in multivariate_cells if cell["cell_type"] == "code"] expected_subsections = [ "MultivariateAnalysisSubsection.PCA", @@ -404,7 +396,7 @@ def test_verbosity_low_different_subsection_verbosities(pyarrow_dtypes: bool): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(multivariate_section, df) + check_section_executes(multivariate_section, test_df) def test_imports_verbosity_low(): @@ -479,11 +471,9 @@ def test_imports_verbosity_low_different_subsection_verbosities(): assert set(exported_imports) == set(expected_imports) -@pyarrow_parameterize -def test_show(pyarrow_dtypes: bool): - df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_show(test_df: pd.DataFrame): multivariate_section = MultivariateAnalysis() with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) with redirect_stdout(None): - multivariate_section.show(df) + multivariate_section.show(test_df) diff --git a/tests/test_overview.py b/tests/test_overview.py index a1ae9bf..34338f6 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -23,7 +23,8 @@ from .execution_utils import check_section_executes -def get_test_df() -> pd.DataFrame: +@pytest.fixture +def test_df() -> pd.DataFrame: test_df = pd.DataFrame(data=[[1.1, "a"], [2.2, "b"], [3.3, "c"]], columns=["A", "B"]) return test_df @@ -126,7 +127,7 @@ def test_section_adding(): ), "Subsection should be DuplicateRows" -def test_code_export_verbosity_low(): +def test_code_export_verbosity_low(test_df: pd.DataFrame): overview_section = Overview(verbosity=Verbosity.LOW) # Export code exported_cells = [] @@ -138,10 +139,10 @@ def test_code_export_verbosity_low(): # Test code equivalence assert exported_code[0] == expected_code[0], "Exported code mismatch" - check_section_executes(overview_section, df=get_test_df()) + check_section_executes(overview_section, df=test_df) -def test_code_export_verbosity_low_with_subsections(): +def test_code_export_verbosity_low_with_subsections(test_df: pd.DataFrame): overview_section = Overview( subsections=[ OverviewSubsection.QuickInfo, @@ -162,10 +163,10 @@ def test_code_export_verbosity_low_with_subsections(): # Test code equivalence assert exported_code[0] == expected_code[0], "Exported code mismatch" - check_section_executes(overview_section, df=get_test_df()) + check_section_executes(overview_section, df=test_df) -def test_code_export_verbosity_medium(): +def test_code_export_verbosity_medium(test_df: pd.DataFrame): # Construct overview section overview_section = Overview( subsections=[ @@ -198,10 +199,10 @@ def test_code_export_verbosity_medium(): for i in range(len(exported_code)): assert exported_code[i] == expected_code[i], "Exported code mismatch" - check_section_executes(overview_section, df=get_test_df()) + check_section_executes(overview_section, df=test_df) -def test_code_export_verbosity_high(): +def test_code_export_verbosity_high(test_df: pd.DataFrame): # Construct overview section overview_section = Overview( subsections=[ @@ -278,10 +279,10 @@ def test_code_export_verbosity_high(): for i in range(len(exported_code)): assert exported_code[i] == expected_code[i], "Exported code mismatch" - check_section_executes(overview_section, df=get_test_df()) + check_section_executes(overview_section, df=test_df) -def test_verbosity_low_different_subsection_verbosities(): +def test_verbosity_low_different_subsection_verbosities(test_df: pd.DataFrame): overview_section = Overview( verbosity=Verbosity.LOW, verbosity_quick_info=Verbosity.MEDIUM, @@ -313,7 +314,7 @@ def test_verbosity_low_different_subsection_verbosities(): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(overview_section, df=get_test_df()) + check_section_executes(overview_section, df=test_df) def test_imports_verbosity_low(): @@ -377,9 +378,9 @@ def test_imports_verbosity_low_different_subsection_verbosities(): assert set(exported_imports) == set(expected_imports) -def test_show(): +def test_show(test_df: pd.DataFrame): overview_section = Overview() - df = get_test_df() + df = test_df with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) with redirect_stdout(None): diff --git a/tests/test_report.py b/tests/test_report.py index 50f419e..80960d9 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -6,6 +6,7 @@ import nbformat import numpy as np import pandas as pd +import pytest from edvart.report import DefaultReport, ExportDataMode, Report from edvart.report_sections.bivariate_analysis import BivariateAnalysis @@ -13,14 +14,15 @@ from edvart.report_sections.univariate_analysis import UnivariateAnalysis -def _get_test_df() -> pd.DataFrame: +@pytest.fixture +def test_df() -> pd.DataFrame: return pd.DataFrame( data=np.random.random_sample((50, 20)), columns=[f"Col{i}" for i in range(20)] ) -def test_report(): - report = Report(dataframe=_get_test_df()) +def test_report(test_df: pd.DataFrame): + report = Report(dataframe=test_df) assert len(report.sections) == 0, "Report should be empty" report.add_overview(verbosity=Verbosity.MEDIUM) @@ -36,11 +38,11 @@ def test_report(): assert report.sections[1].columns == ["Col1", "Col2", "Col3"], "Wrong columns" -def test_add_section(): +def test_add_section(test_df: pd.DataFrame): bivariate_analysis_section = BivariateAnalysis() univariate_analysis_section = UnivariateAnalysis() report = ( - Report(dataframe=_get_test_df()) + Report(dataframe=test_df) .add_section(bivariate_analysis_section) .add_section(univariate_analysis_section) ) @@ -48,9 +50,9 @@ def test_add_section(): assert report.sections == [bivariate_analysis_section, univariate_analysis_section] -def test_default_report(): +def test_default_report(test_df: pd.DataFrame): report = DefaultReport( - dataframe=_get_test_df(), + dataframe=test_df, verbosity_overview=Verbosity.MEDIUM, verbosity_univariate_analysis=Verbosity.HIGH, columns_bivariate_analysis=["Col1", "Col2", "Col3"], @@ -67,8 +69,7 @@ def test_default_report(): assert report.sections[2].columns == ["Col1", "Col2", "Col3"], "Wrong columns" -def test_column_selection(): - test_df = _get_test_df() +def test_column_selection(test_df: pd.DataFrame): report = Report(dataframe=test_df) # Default column selection @@ -85,8 +86,7 @@ def test_column_selection(): assert set(report.sections[2].columns) == {"Col5", "Col7", "Col13"}, "Wrong column selection" -def test_show(): - test_df = _get_test_df() +def test_show(test_df: pd.DataFrame): report = Report(dataframe=test_df) with warnings.catch_warnings(): @@ -95,8 +95,8 @@ def test_show(): report.show() -def test_notebook_export(tmp_path: pathlib.Path): - report = Report(dataframe=_get_test_df()) +def test_notebook_export(tmp_path: pathlib.Path, test_df: pd.DataFrame): + report = Report(dataframe=test_df) report.add_overview() for export_data_mode in ( @@ -112,8 +112,8 @@ def test_notebook_export(tmp_path: pathlib.Path): ) -def test_exported_notebook_executes(tmp_path: pathlib.Path): - report = Report(dataframe=_get_test_df()) +def test_exported_notebook_executes(tmp_path: pathlib.Path, test_df: pd.DataFrame): + report = Report(dataframe=test_df) report.add_overview() for export_data_mode in (ExportDataMode.EMBED, ExportDataMode.FILE): diff --git a/tests/test_timeseries_analysis.py b/tests/test_timeseries_analysis.py index 33757f5..fd60e09 100644 --- a/tests/test_timeseries_analysis.py +++ b/tests/test_timeseries_analysis.py @@ -22,7 +22,8 @@ pio.renderers.default = "json" -def get_test_df() -> pd.DataFrame: +@pytest.fixture +def test_df() -> pd.DataFrame: n_rows = 20 columns = ["a", "b", "c"] return pd.DataFrame( @@ -185,9 +186,9 @@ def test_ft_no_sampling_rate_error(): ) -def test_code_export_verbosity_low(): +def test_code_export_verbosity_low(test_df: pd.DataFrame): ts_section = TimeseriesAnalysis(verbosity=Verbosity.LOW) - test_df = get_test_df() + test_df = test_df # Export code exported_cells = [] ts_section.add_cells(exported_cells, df=test_df) @@ -202,7 +203,7 @@ def test_code_export_verbosity_low(): check_section_executes(ts_section, test_df) -def test_code_export_verbosity_low_with_subsections(): +def test_code_export_verbosity_low_with_subsections(test_df: pd.DataFrame): ts_section = TimeseriesAnalysis( subsections=[ TimeseriesAnalysisSubsection.RollingStatistics, @@ -210,10 +211,10 @@ def test_code_export_verbosity_low_with_subsections(): ], verbosity=Verbosity.LOW, ) - test_df = get_test_df() + test_df = test_df # Export code exported_cells = [] - ts_section.add_cells(exported_cells, df=get_test_df()) + ts_section.add_cells(exported_cells, df=test_df) # Remove markdown and other cells and get code strings exported_code = [cell["source"] for cell in exported_cells if cell["cell_type"] == "code"] # Define expected code @@ -229,7 +230,7 @@ def test_code_export_verbosity_low_with_subsections(): check_section_executes(ts_section, test_df) -def test_code_export_verbosity_low_with_fft_stft(): +def test_code_export_verbosity_low_with_fft_stft(test_df: pd.DataFrame): ts_section = TimeseriesAnalysis( subsections=[ TimeseriesAnalysisSubsection.FourierTransform, @@ -239,7 +240,6 @@ def test_code_export_verbosity_low_with_fft_stft(): sampling_rate=1, stft_window_size=1, ) - test_df = get_test_df() # Export code exported_cells = [] ts_section.add_cells(exported_cells, df=test_df) @@ -259,9 +259,8 @@ def test_code_export_verbosity_low_with_fft_stft(): check_section_executes(ts_section, test_df) -def test_generated_code_verbosity_medium(): +def test_generated_code_verbosity_medium(test_df: pd.DataFrame): ts_section = TimeseriesAnalysis(verbosity=Verbosity.MEDIUM) - test_df = get_test_df() exported_cells = [] ts_section.add_cells(exported_cells, df=test_df) @@ -283,8 +282,7 @@ def test_generated_code_verbosity_medium(): check_section_executes(ts_section, test_df) -def test_generated_code_verbosity_high(): - test_df = get_test_df() +def test_generated_code_verbosity_high(test_df: pd.DataFrame): ts_section = TimeseriesAnalysis(verbosity=Verbosity.HIGH, sampling_rate=1, stft_window_size=1) pairplot_cells = [] @@ -354,8 +352,7 @@ def test_generated_code_verbosity_high(): check_section_executes(ts_section, test_df) -def test_verbosity_low_different_subsection_verbosities(): - test_df = get_test_df() +def test_verbosity_low_different_subsection_verbosities(test_df: pd.DataFrame): ts_section = TimeseriesAnalysis( verbosity=Verbosity.LOW, subsections=[ @@ -396,7 +393,7 @@ def test_verbosity_low_different_subsection_verbosities(): assert expected_line == exported_line, "Exported code mismatch" -def test_boxplots_over_time_def(): +def test_boxplots_over_time_def(test_df: pd.DataFrame): def month_func(x: datetime) -> str: return str(x.month) @@ -420,10 +417,10 @@ def month_func(x: datetime) -> str: for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(boxplots_sub, get_test_df()) + check_section_executes(boxplots_sub, test_df) -def test_boxplots_over_time_lambda(): +def test_boxplots_over_time_lambda(test_df: pd.DataFrame): month_lambda = lambda x: x.month # noqa: E731 boxplots_sub = BoxplotsOverTime(grouping_name="Month", grouping_function=month_lambda) @@ -443,7 +440,7 @@ def test_boxplots_over_time_lambda(): for expected_line, exported_line in zip(expected_code, exported_code): assert expected_line == exported_line, "Exported code mismatch" - check_section_executes(boxplots_sub, get_test_df()) + check_section_executes(boxplots_sub, test_df) def test_imports_verbosity_low(): diff --git a/tests/test_univariate_analysis.py b/tests/test_univariate_analysis.py index f69a4d9..d5cf6ea 100644 --- a/tests/test_univariate_analysis.py +++ b/tests/test_univariate_analysis.py @@ -10,12 +10,13 @@ from edvart.report_sections.section_base import Verbosity from .execution_utils import check_section_executes -from .pyarrow_utils import pyarrow_parameterize +from .pyarrow_utils import pyarrow_params -def get_test_df(pyarrow_dtypes: bool) -> pd.DataFrame: +@pytest.fixture(params=pyarrow_params) +def test_df(request) -> pd.DataFrame: test_df = pd.DataFrame(data=[[1.9, "a"], [2.1, "b"], [3.3, "c"]], columns=["A", "B"]) - if pyarrow_dtypes: + if request.param: test_df = test_df.convert_dtypes(dtype_backend="pyarrow") return test_df @@ -32,9 +33,7 @@ def test_invalid_verbosity(): univariate_analysis.UnivariateAnalysis(verbosity="1") -@pyarrow_parameterize -def test_code_export_verbosity_low(pyarrow_dtypes: bool): - test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_code_export_verbosity_low(test_df: pd.DataFrame): # Construct univariate analysis section univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.LOW) # Export code @@ -50,9 +49,7 @@ def test_code_export_verbosity_low(pyarrow_dtypes: bool): check_section_executes(univariate_section, test_df) -@pyarrow_parameterize -def test_code_export_verbosity_medium(pyarrow_dtypes: bool): - test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_code_export_verbosity_medium(test_df: pd.DataFrame): # Construct univariate analysis section univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.MEDIUM) # Export code @@ -72,9 +69,7 @@ def test_code_export_verbosity_medium(pyarrow_dtypes: bool): check_section_executes(univariate_section, test_df) -@pyarrow_parameterize -def test_code_export_verbosity_high(pyarrow_dtypes: bool): - test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_code_export_verbosity_high(test_df: pd.DataFrame): # Construct univariate analysis section univariate_section = univariate_analysis.UnivariateAnalysis(verbosity=Verbosity.HIGH) # Export code @@ -125,9 +120,7 @@ def test_code_export_verbosity_high(pyarrow_dtypes: bool): check_section_executes(univariate_section, test_df) -@pyarrow_parameterize -def test_show(pyarrow_dtypes: bool): - test_df = get_test_df(pyarrow_dtypes=pyarrow_dtypes) +def test_show(test_df: pd.DataFrame): univariate_section = univariate_analysis.UnivariateAnalysis() with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) diff --git a/tests/test_utils.py b/tests/test_utils.py index aad6be5..d097993 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,13 +4,14 @@ import numpy as np import pandas as pd +import pytest from edvart import utils -from .pyarrow_utils import pyarrow_parameterize +from .pyarrow_utils import pyarrow_params -@pyarrow_parameterize +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) def test_full_na_series(pyarrow_dtypes: bool): series = pd.Series([None, np.nan, None]) if pyarrow_dtypes: From e2889d4820d7af2233664e1540bceaeffe082c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Bel=C3=A1k?= Date: Tue, 12 Mar 2024 16:34:14 +0100 Subject: [PATCH 2/2] test: test data type inference with pyarrow dtypes (#215) --- tests/test_data_type_inference.py | 211 ++++++++++++++++-------------- 1 file changed, 112 insertions(+), 99 deletions(-) diff --git a/tests/test_data_type_inference.py b/tests/test_data_type_inference.py index aedc9f2..4095fe9 100644 --- a/tests/test_data_type_inference.py +++ b/tests/test_data_type_inference.py @@ -1,115 +1,128 @@ import numpy as np import pandas as pd +import pytest from edvart import data_types +from .pyarrow_utils import pyarrow_params -def test_inference(): - assert ( - data_types.infer_data_type(pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312])) - == data_types.DataType.NUMERIC - ), "Should be numeric type" - assert ( - data_types.infer_data_type( - pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]) - ) - == data_types.DataType.DATE - ), "Should be date type" - assert ( - data_types.infer_data_type(pd.Series(["A", "B", "C", "C", "A", "B"])) - == data_types.DataType.CATEGORICAL - ), "Should be categorical type" - assert ( - data_types.infer_data_type(pd.Series([True, False, False, True, True])) - == data_types.DataType.BOOLEAN - ), "Should be boolean type" - assert data_types.infer_data_type( - pd.Series([None, None, np.nan, float("nan")]) == data_types.DataType.MISSING - ), "Should be missing" - assert ( - data_types.infer_data_type(pd.Series(list(range(10)))) == data_types.DataType.UNIQUE - ), "Should be unique" - assert ( - data_types.infer_data_type(pd.Series([1] + list(range(100)))) == data_types.DataType.NUMERIC - ), "Should be numeric" - assert ( - data_types.infer_data_type(pd.Series(dtype=pd.Float64Dtype)) == data_types.DataType.UNKNOWN - ), "Should be unknown" - assert data_types.infer_data_type( - pd.Series([True, False]) == data_types.DataType.BOOLEAN - ), "Should be boolean" +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) +@pytest.mark.parametrize( + "data, expected", + [ + (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), data_types.DataType.NUMERIC), + ( + pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]), + data_types.DataType.DATE, + ), + (pd.Series(["A", "B", "C", "C", "A", "B"]), data_types.DataType.CATEGORICAL), + (pd.Series([True, False, False, True, True]), data_types.DataType.BOOLEAN), + (pd.Series([None, None, np.nan, float("nan")]), data_types.DataType.MISSING), + (pd.Series(list(range(10))), data_types.DataType.UNIQUE), + (pd.Series([1] + list(range(100))), data_types.DataType.NUMERIC), + (pd.Series(dtype=pd.Float64Dtype), data_types.DataType.UNKNOWN), + (pd.Series([True, False]), data_types.DataType.BOOLEAN), + ], +) +def test_inference(data, expected, pyarrow_dtypes): + if pyarrow_dtypes: + data = data.convert_dtypes(dtype_backend="pyarrow") + assert data_types.infer_data_type(data) == expected -def test_missing_series(): - assert data_types.is_missing(pd.Series([None, None, np.nan, float("nan")])), "Should be missing" - assert data_types.is_missing(pd.Series([pd.NA])), "Should be missing" - assert not data_types.is_missing(pd.Series([1, np.nan, None])), "Should not be missing" - assert not data_types.is_missing(pd.Series(["2023-01-01", None])), "Should not be missing" +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) +@pytest.mark.parametrize( + "data, is_missing", + [ + (pd.Series([None, None, np.nan, float("nan")]), True), + (pd.Series([pd.NA]), True), + (pd.Series([1, np.nan, None]), False), + (pd.Series(["2023-01-01", None]), False), + ], +) +def test_missing_series(data, is_missing, pyarrow_dtypes): + if pyarrow_dtypes: + data = data.convert_dtypes(dtype_backend="pyarrow") + assert data_types.is_missing(data) == is_missing -def test_numeric_series(): - assert data_types.is_numeric( - pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]) - ), "Should be numeric type" - assert data_types.is_numeric(pd.Series([23, 45, 2, 1.2, -3, -66])), "Should be numeric type" - assert not data_types.is_numeric( - pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]) - ), "Should not be numeric type" - assert not data_types.is_numeric( - pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]) - ), "Should not be numeric type" - assert not data_types.is_numeric( - pd.Series([None, None, np.nan, float("nan")]) - ), "Should not be numeric" +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) +@pytest.mark.parametrize( + "data, is_numeric", + [ + (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), True), + (pd.Series([23, 45, 2, 1.2, -3, -66]), True), + (pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]), False), + (pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]), False), + (pd.Series([None, None, np.nan, float("nan")]), False), + ], +) +def test_numeric_series(data, is_numeric, pyarrow_dtypes): + if pyarrow_dtypes: + data = data.convert_dtypes(dtype_backend="pyarrow") + assert data_types.is_numeric(data) == is_numeric -def test_categorical_series(): - assert data_types.is_categorical(pd.Series(["A", "B", "C", "D"])), "Should be categorical" - assert data_types.is_categorical( - pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]) - ), "Should be categorical" - assert not data_types.is_categorical( - pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]) - ), "Should not be categorical" - assert not data_types.is_categorical( - pd.Series([None, None, np.nan, float("nan")]) - ), "Should not be categorical" - assert not data_types.is_categorical(pd.Series([pd.NA])), "Should not be categorical" +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) +@pytest.mark.parametrize( + "data, is_categorical", + [ + (pd.Series(["A", "B", "C", "D"]), True), + (pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]), True), + ( + pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]), + False, + ), + (pd.Series([None, None, np.nan, float("nan")]), False), + (pd.Series([pd.NA]), False), + ], +) +def test_categorical_series(data, is_categorical, pyarrow_dtypes): + if pyarrow_dtypes: + data = data.convert_dtypes(dtype_backend="pyarrow") + assert data_types.is_categorical(data) == is_categorical -def test_boolean_series(): - assert data_types.is_boolean(pd.Series([True, False, False, True, True])), "Should be boolean" - assert data_types.is_boolean(pd.Series([False, False, False])), "Should be boolean" - assert data_types.is_boolean(pd.Series([True, True, True])), "Should be boolean" - assert data_types.is_boolean(pd.Series([1, 0, 0, 1])), "Should be boolean" - assert data_types.is_boolean(pd.Series([0, 0, 0, 0])), "Should be boolean" - assert data_types.is_boolean(pd.Series([1, 1, 1, 1])), "Should be boolean" - assert not data_types.is_boolean( - pd.Series([True, False, False, True, True, "True"]) - ), "Should not be boolean" - assert not data_types.is_boolean(pd.Series([2, 2, 2, 2])), "Should not be boolean" - assert not data_types.is_boolean(pd.Series([1, 0, 0, 1, 3])), "Should not be boolean" - assert not data_types.is_boolean(pd.Series(["a", "abc", "2"])), "Should not be boolean" - assert not data_types.is_boolean(pd.Series(["A", "B", "A", "A", "B"])), "Should not be boolean" - assert not data_types.is_boolean(pd.Series([-0.2, 1.6567, 3, 4, 5])), "Should not be boolean" - assert not data_types.is_boolean(pd.Series([None])), "Should not be boolean" +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) +@pytest.mark.parametrize( + "data, is_boolean", + [ + (pd.Series([True, False, False, True, True]), True), + (pd.Series([False, False, False]), True), + (pd.Series([True, True, True]), True), + (pd.Series([1, 0, 0, 1]), True), + (pd.Series([0, 0, 0, 0]), True), + (pd.Series([1, 1, 1, 1]), True), + (pd.Series([True, False, False, True, True, "True"]), False), + (pd.Series([2, 2, 2, 2]), False), + (pd.Series([1, 0, 0, 1, 3]), False), + (pd.Series(["a", "abc", "2"]), False), + (pd.Series(["A", "B", "A", "A", "B"]), False), + (pd.Series([-0.2, 1.6567, 3, 4, 5]), False), + (pd.Series([None]), False), + ], +) +def test_boolean_series(data, is_boolean, pyarrow_dtypes): + if pyarrow_dtypes: + data = data.convert_dtypes(dtype_backend="pyarrow") + assert data_types.is_boolean(data) == is_boolean -def test_date_series(): - assert data_types.is_date( - pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]) - ), "Should be type date" - assert data_types.is_date( - pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]) - ), "Should be type date" - assert not data_types.is_date( - pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]) - ), "Should not be type date" - assert not data_types.is_date( - pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]) - ), "Should not be type date" - assert not data_types.is_date(pd.Series([1, 2, 3, 4, 5])), "Should not be type date" - assert not data_types.is_date(pd.Series([None, 2.0, 3, 4, 5])), "Should not be type date" - assert data_types.is_date( - pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]) - ), "Should be type date" + +@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params) +@pytest.mark.parametrize( + "data, is_date", + [ + (pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]), True), + (pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]), True), + (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]), False), + (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]), False), + (pd.Series([1, 2, 3, 4, 5]), False), + (pd.Series([None, 2.0, 3, 4, 5]), False), + (pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]), True), + ], +) +def test_date_series(data, is_date, pyarrow_dtypes): + if pyarrow_dtypes: + data = data.convert_dtypes(dtype_backend="pyarrow") + assert data_types.is_date(data) == is_date