Merge remote-tracking branch 'origin/main' into refactor/mypy

datamole-ai · Mar 12, 2024 · 3dc8b59
2 parents 4abc8a2 + e2889d4
commit 3dc8b59
Show file tree

Hide file tree

Showing 10 changed files with 251 additions and 272 deletions.
diff --git a/tests/pyarrow_utils.py b/tests/pyarrow_utils.py
@@ -1,8 +1,6 @@
-import pytest
-
 from edvart.data_types import PYARROW_PANDAS_BACKEND_AVAILABLE
 
 if PYARROW_PANDAS_BACKEND_AVAILABLE:
-    pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False, True])
+    pyarrow_params = [True, False]
 else:
-    pyarrow_parameterize = pytest.mark.parametrize("pyarrow_dtypes", [False])
+    pyarrow_params = [False]
diff --git a/tests/test_bivariate_analysis.py b/tests/test_bivariate_analysis.py
@@ -11,12 +11,13 @@
 from edvart.report_sections.section_base import Verbosity
 
 from .execution_utils import check_section_executes
-from .pyarrow_utils import pyarrow_parameterize
+from .pyarrow_utils import pyarrow_params
 
 
-def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame:
+@pytest.fixture(params=pyarrow_params)
+def test_df(request) -> pd.DataFrame:
     test_df = pd.DataFrame(data=[[1.1, "a"], [2.2, "b"], [3.3, "c"]], columns=["A", "B"])
-    if pyarrow_dtypes:
+    if request.param:
         test_df = test_df.convert_dtypes(dtype_backend="pyarrow")
 
     return test_df
@@ -125,7 +126,7 @@ def test_section_adding():
     ), "Subsection should be ContingencyTable"
 
 
-def test_code_export_verbosity_low():
+def test_code_export_verbosity_low(test_df: pd.DataFrame):
     bivariate_section = bivariate_analysis.BivariateAnalysis(verbosity=Verbosity.LOW)
     # Export code
     exported_cells = []
@@ -138,10 +139,10 @@ def test_code_export_verbosity_low():
     assert len(exported_code) == 1
     assert exported_code[0] == expected_code[0], "Exported code mismatch"
 
-    check_section_executes(bivariate_section, df=get_test_df())
+    check_section_executes(bivariate_section, df=test_df)
 
 
-def test_code_export_verbosity_low_with_subsections():
+def test_code_export_verbosity_low_with_subsections(test_df: pd.DataFrame):
     bivariate_section = bivariate_analysis.BivariateAnalysis(
         subsections=[
             BivariateAnalysisSubsection.ContingencyTable,
@@ -164,7 +165,7 @@ def test_code_export_verbosity_low_with_subsections():
     assert len(exported_code) == 1
     assert exported_code[0] == expected_code[0], "Exported code mismatch"
 
-    check_section_executes(bivariate_section, df=get_test_df())
+    check_section_executes(bivariate_section, df=test_df)
 
 
 def test_generated_code_verbosity_low_columns():
@@ -209,7 +210,7 @@ def test_generated_code_verbosity_low_columns():
     check_section_executes(bivariate_section, df=test_df)
 
 
-def test_generated_code_verbosity_medium():
+def test_generated_code_verbosity_medium(test_df: pd.DataFrame):
     bivariate_section = bivariate_analysis.BivariateAnalysis(
         verbosity=Verbosity.MEDIUM,
         subsections=[
@@ -233,7 +234,7 @@ def test_generated_code_verbosity_medium():
     for expected_line, exported_line in zip(expected_code, exported_code):
         assert expected_line == exported_line, "Exported code mismatch"
 
-    check_section_executes(bivariate_section, df=get_test_df())
+    check_section_executes(bivariate_section, df=test_df)
 
 
 def test_generated_code_verbosity_medium_columns_x_y():
@@ -307,7 +308,7 @@ def test_generated_code_verbosity_medium_columns_pairs():
     check_section_executes(bivariate_section, df=test_df)
 
 
-def test_generated_code_verbosity_high():
+def test_generated_code_verbosity_high(test_df: pd.DataFrame):
     bivariate_section = bivariate_analysis.BivariateAnalysis(
         verbosity=Verbosity.HIGH,
         subsections=[
@@ -345,10 +346,10 @@ def test_generated_code_verbosity_high():
     for expected_line, exported_line in zip(expected_code, exported_code):
         assert expected_line == exported_line, "Exported code mismatch"
 
-    check_section_executes(bivariate_section, df=get_test_df())
+    check_section_executes(bivariate_section, df=test_df)
 
 
-def test_verbosity_low_different_subsection_verbosities():
+def test_verbosity_low_different_subsection_verbosities(test_df: pd.DataFrame):
     bivariate_section = BivariateAnalysis(
         verbosity=Verbosity.LOW,
         subsections=[
@@ -377,7 +378,7 @@ def test_verbosity_low_different_subsection_verbosities():
     for expected_line, exported_line in zip(expected_code, exported_code):
         assert expected_line == exported_line, "Exported code mismatch"
 
-    check_section_executes(bivariate_section, df=get_test_df())
+    check_section_executes(bivariate_section, df=test_df)
 
 
 def test_imports_verbosity_low():
@@ -449,10 +450,9 @@ def test_imports_verbosity_low_different_subsection_verbosities():
     assert set(exported_imports) == set(expected_imports)
 
 
-@pyarrow_parameterize
-def test_show(pyarrow_dtypes: bool):
+def test_show(test_df: pd.DataFrame):
     bivariate_section = BivariateAnalysis()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", UserWarning)
         with redirect_stdout(None):
-            bivariate_section.show(get_test_df(pyarrow_dtypes=pyarrow_dtypes))
+            bivariate_section.show(test_df)
diff --git a/tests/test_data_type_inference.py b/tests/test_data_type_inference.py
@@ -1,115 +1,128 @@
 import numpy as np
 import pandas as pd
+import pytest
 
 from edvart import data_types
 
+from .pyarrow_utils import pyarrow_params
 
-def test_inference():
-    assert (
-        data_types.infer_data_type(pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]))
-        == data_types.DataType.NUMERIC
-    ), "Should be numeric type"
-    assert (
-        data_types.infer_data_type(
-            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
-        )
-        == data_types.DataType.DATE
-    ), "Should be date type"
-    assert (
-        data_types.infer_data_type(pd.Series(["A", "B", "C", "C", "A", "B"]))
-        == data_types.DataType.CATEGORICAL
-    ), "Should be categorical type"
-    assert (
-        data_types.infer_data_type(pd.Series([True, False, False, True, True]))
-        == data_types.DataType.BOOLEAN
-    ), "Should be boolean type"
-    assert data_types.infer_data_type(
-        pd.Series([None, None, np.nan, float("nan")]) == data_types.DataType.MISSING
-    ), "Should be missing"
-    assert (
-        data_types.infer_data_type(pd.Series(list(range(10)))) == data_types.DataType.UNIQUE
-    ), "Should be unique"
-    assert (
-        data_types.infer_data_type(pd.Series([1] + list(range(100)))) == data_types.DataType.NUMERIC
-    ), "Should be numeric"
-    assert (
-        data_types.infer_data_type(pd.Series(dtype=pd.Float64Dtype)) == data_types.DataType.UNKNOWN
-    ), "Should be unknown"
-    assert data_types.infer_data_type(
-        pd.Series([True, False]) == data_types.DataType.BOOLEAN
-    ), "Should be boolean"
 
+@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), data_types.DataType.NUMERIC),
+        (
+            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]),
+            data_types.DataType.DATE,
+        ),
+        (pd.Series(["A", "B", "C", "C", "A", "B"]), data_types.DataType.CATEGORICAL),
+        (pd.Series([True, False, False, True, True]), data_types.DataType.BOOLEAN),
+        (pd.Series([None, None, np.nan, float("nan")]), data_types.DataType.MISSING),
+        (pd.Series(list(range(10))), data_types.DataType.UNIQUE),
+        (pd.Series([1] + list(range(100))), data_types.DataType.NUMERIC),
+        (pd.Series(dtype=pd.Float64Dtype), data_types.DataType.UNKNOWN),
+        (pd.Series([True, False]), data_types.DataType.BOOLEAN),
+    ],
+)
+def test_inference(data, expected, pyarrow_dtypes):
+    if pyarrow_dtypes:
+        data = data.convert_dtypes(dtype_backend="pyarrow")
+    assert data_types.infer_data_type(data) == expected
 
-def test_missing_series():
-    assert data_types.is_missing(pd.Series([None, None, np.nan, float("nan")])), "Should be missing"
-    assert data_types.is_missing(pd.Series([pd.NA])), "Should be missing"
-    assert not data_types.is_missing(pd.Series([1, np.nan, None])), "Should not be missing"
-    assert not data_types.is_missing(pd.Series(["2023-01-01", None])), "Should not be missing"
 
+@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
+@pytest.mark.parametrize(
+    "data, is_missing",
+    [
+        (pd.Series([None, None, np.nan, float("nan")]), True),
+        (pd.Series([pd.NA]), True),
+        (pd.Series([1, np.nan, None]), False),
+        (pd.Series(["2023-01-01", None]), False),
+    ],
+)
+def test_missing_series(data, is_missing, pyarrow_dtypes):
+    if pyarrow_dtypes:
+        data = data.convert_dtypes(dtype_backend="pyarrow")
+    assert data_types.is_missing(data) == is_missing
 
-def test_numeric_series():
-    assert data_types.is_numeric(
-        pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312])
-    ), "Should be numeric type"
-    assert data_types.is_numeric(pd.Series([23, 45, 2, 1.2, -3, -66])), "Should be numeric type"
-    assert not data_types.is_numeric(
-        pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"])
-    ), "Should not be numeric type"
-    assert not data_types.is_numeric(
-        pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"])
-    ), "Should not be numeric type"
-    assert not data_types.is_numeric(
-        pd.Series([None, None, np.nan, float("nan")])
-    ), "Should not be numeric"
 
+@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
+@pytest.mark.parametrize(
+    "data, is_numeric",
+    [
+        (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), True),
+        (pd.Series([23, 45, 2, 1.2, -3, -66]), True),
+        (pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]), False),
+        (pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]), False),
+        (pd.Series([None, None, np.nan, float("nan")]), False),
+    ],
+)
+def test_numeric_series(data, is_numeric, pyarrow_dtypes):
+    if pyarrow_dtypes:
+        data = data.convert_dtypes(dtype_backend="pyarrow")
+    assert data_types.is_numeric(data) == is_numeric
 
-def test_categorical_series():
-    assert data_types.is_categorical(pd.Series(["A", "B", "C", "D"])), "Should be categorical"
-    assert data_types.is_categorical(
-        pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4])
-    ), "Should be categorical"
-    assert not data_types.is_categorical(
-        pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8])
-    ), "Should not be categorical"
-    assert not data_types.is_categorical(
-        pd.Series([None, None, np.nan, float("nan")])
-    ), "Should not be categorical"
-    assert not data_types.is_categorical(pd.Series([pd.NA])), "Should not be categorical"
 
+@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
+@pytest.mark.parametrize(
+    "data, is_categorical",
+    [
+        (pd.Series(["A", "B", "C", "D"]), True),
+        (pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]), True),
+        (
+            pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]),
+            False,
+        ),
+        (pd.Series([None, None, np.nan, float("nan")]), False),
+        (pd.Series([pd.NA]), False),
+    ],
+)
+def test_categorical_series(data, is_categorical, pyarrow_dtypes):
+    if pyarrow_dtypes:
+        data = data.convert_dtypes(dtype_backend="pyarrow")
+    assert data_types.is_categorical(data) == is_categorical
 
-def test_boolean_series():
-    assert data_types.is_boolean(pd.Series([True, False, False, True, True])), "Should be boolean"
-    assert data_types.is_boolean(pd.Series([False, False, False])), "Should be boolean"
-    assert data_types.is_boolean(pd.Series([True, True, True])), "Should be boolean"
-    assert data_types.is_boolean(pd.Series([1, 0, 0, 1])), "Should be boolean"
-    assert data_types.is_boolean(pd.Series([0, 0, 0, 0])), "Should be boolean"
-    assert data_types.is_boolean(pd.Series([1, 1, 1, 1])), "Should be boolean"
-    assert not data_types.is_boolean(
-        pd.Series([True, False, False, True, True, "True"])
-    ), "Should not be boolean"
-    assert not data_types.is_boolean(pd.Series([2, 2, 2, 2])), "Should not be boolean"
-    assert not data_types.is_boolean(pd.Series([1, 0, 0, 1, 3])), "Should not be boolean"
-    assert not data_types.is_boolean(pd.Series(["a", "abc", "2"])), "Should not be boolean"
-    assert not data_types.is_boolean(pd.Series(["A", "B", "A", "A", "B"])), "Should not be boolean"
-    assert not data_types.is_boolean(pd.Series([-0.2, 1.6567, 3, 4, 5])), "Should not be boolean"
-    assert not data_types.is_boolean(pd.Series([None])), "Should not be boolean"
 
+@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
+@pytest.mark.parametrize(
+    "data, is_boolean",
+    [
+        (pd.Series([True, False, False, True, True]), True),
+        (pd.Series([False, False, False]), True),
+        (pd.Series([True, True, True]), True),
+        (pd.Series([1, 0, 0, 1]), True),
+        (pd.Series([0, 0, 0, 0]), True),
+        (pd.Series([1, 1, 1, 1]), True),
+        (pd.Series([True, False, False, True, True, "True"]), False),
+        (pd.Series([2, 2, 2, 2]), False),
+        (pd.Series([1, 0, 0, 1, 3]), False),
+        (pd.Series(["a", "abc", "2"]), False),
+        (pd.Series(["A", "B", "A", "A", "B"]), False),
+        (pd.Series([-0.2, 1.6567, 3, 4, 5]), False),
+        (pd.Series([None]), False),
+    ],
+)
+def test_boolean_series(data, is_boolean, pyarrow_dtypes):
+    if pyarrow_dtypes:
+        data = data.convert_dtypes(dtype_backend="pyarrow")
+    assert data_types.is_boolean(data) == is_boolean
 
-def test_date_series():
-    assert data_types.is_date(
-        pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
-    ), "Should be type date"
-    assert data_types.is_date(
-        pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"])
-    ), "Should be type date"
-    assert not data_types.is_date(
-        pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"])
-    ), "Should not be type date"
-    assert not data_types.is_date(
-        pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3])
-    ), "Should not be type date"
-    assert not data_types.is_date(pd.Series([1, 2, 3, 4, 5])), "Should not be type date"
-    assert not data_types.is_date(pd.Series([None, 2.0, 3, 4, 5])), "Should not be type date"
-    assert data_types.is_date(
-        pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None])
-    ), "Should be type date"
+
+@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
+@pytest.mark.parametrize(
+    "data, is_date",
+    [
+        (pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]), True),
+        (pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]), True),
+        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]), False),
+        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]), False),
+        (pd.Series([1, 2, 3, 4, 5]), False),
+        (pd.Series([None, 2.0, 3, 4, 5]), False),
+        (pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]), True),
+    ],
+)
+def test_date_series(data, is_date, pyarrow_dtypes):
+    if pyarrow_dtypes:
+        data = data.convert_dtypes(dtype_backend="pyarrow")
+    assert data_types.is_date(data) == is_date