Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into refactor/mypy
Browse files Browse the repository at this point in the history
  • Loading branch information
mbelak-dtml committed Mar 12, 2024
2 parents 4abc8a2 + e2889d4 commit 3dc8b59
Show file tree
Hide file tree
Showing 10 changed files with 251 additions and 272 deletions.
6 changes: 2 additions & 4 deletions tests/pyarrow_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import pytest

from edvart.data_types import PYARROW_PANDAS_BACKEND_AVAILABLE

# Values for the ``pyarrow_dtypes`` test argument: the ``True`` variant is only
# included when PYARROW_PANDAS_BACKEND_AVAILABLE is truthy.
pyarrow_params = [True, False] if PYARROW_PANDAS_BACKEND_AVAILABLE else [False]

# Ready-made ``parametrize`` marker over the same flag.
pyarrow_parameterize = pytest.mark.parametrize(
    "pyarrow_dtypes",
    [False, True] if PYARROW_PANDAS_BACKEND_AVAILABLE else [False],
)
32 changes: 16 additions & 16 deletions tests/test_bivariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
from edvart.report_sections.section_base import Verbosity

from .execution_utils import check_section_executes
from .pyarrow_utils import pyarrow_parameterize
from .pyarrow_utils import pyarrow_params


def get_test_df(pyarrow_dtypes: bool = False) -> pd.DataFrame:
@pytest.fixture(params=pyarrow_params)
def test_df(request) -> pd.DataFrame:
test_df = pd.DataFrame(data=[[1.1, "a"], [2.2, "b"], [3.3, "c"]], columns=["A", "B"])
if pyarrow_dtypes:
if request.param:
test_df = test_df.convert_dtypes(dtype_backend="pyarrow")

return test_df
Expand Down Expand Up @@ -125,7 +126,7 @@ def test_section_adding():
), "Subsection should be ContingencyTable"


def test_code_export_verbosity_low():
def test_code_export_verbosity_low(test_df: pd.DataFrame):
bivariate_section = bivariate_analysis.BivariateAnalysis(verbosity=Verbosity.LOW)
# Export code
exported_cells = []
Expand All @@ -138,10 +139,10 @@ def test_code_export_verbosity_low():
assert len(exported_code) == 1
assert exported_code[0] == expected_code[0], "Exported code mismatch"

check_section_executes(bivariate_section, df=get_test_df())
check_section_executes(bivariate_section, df=test_df)


def test_code_export_verbosity_low_with_subsections():
def test_code_export_verbosity_low_with_subsections(test_df: pd.DataFrame):
bivariate_section = bivariate_analysis.BivariateAnalysis(
subsections=[
BivariateAnalysisSubsection.ContingencyTable,
Expand All @@ -164,7 +165,7 @@ def test_code_export_verbosity_low_with_subsections():
assert len(exported_code) == 1
assert exported_code[0] == expected_code[0], "Exported code mismatch"

check_section_executes(bivariate_section, df=get_test_df())
check_section_executes(bivariate_section, df=test_df)


def test_generated_code_verbosity_low_columns():
Expand Down Expand Up @@ -209,7 +210,7 @@ def test_generated_code_verbosity_low_columns():
check_section_executes(bivariate_section, df=test_df)


def test_generated_code_verbosity_medium():
def test_generated_code_verbosity_medium(test_df: pd.DataFrame):
bivariate_section = bivariate_analysis.BivariateAnalysis(
verbosity=Verbosity.MEDIUM,
subsections=[
Expand All @@ -233,7 +234,7 @@ def test_generated_code_verbosity_medium():
for expected_line, exported_line in zip(expected_code, exported_code):
assert expected_line == exported_line, "Exported code mismatch"

check_section_executes(bivariate_section, df=get_test_df())
check_section_executes(bivariate_section, df=test_df)


def test_generated_code_verbosity_medium_columns_x_y():
Expand Down Expand Up @@ -307,7 +308,7 @@ def test_generated_code_verbosity_medium_columns_pairs():
check_section_executes(bivariate_section, df=test_df)


def test_generated_code_verbosity_high():
def test_generated_code_verbosity_high(test_df: pd.DataFrame):
bivariate_section = bivariate_analysis.BivariateAnalysis(
verbosity=Verbosity.HIGH,
subsections=[
Expand Down Expand Up @@ -345,10 +346,10 @@ def test_generated_code_verbosity_high():
for expected_line, exported_line in zip(expected_code, exported_code):
assert expected_line == exported_line, "Exported code mismatch"

check_section_executes(bivariate_section, df=get_test_df())
check_section_executes(bivariate_section, df=test_df)


def test_verbosity_low_different_subsection_verbosities():
def test_verbosity_low_different_subsection_verbosities(test_df: pd.DataFrame):
bivariate_section = BivariateAnalysis(
verbosity=Verbosity.LOW,
subsections=[
Expand Down Expand Up @@ -377,7 +378,7 @@ def test_verbosity_low_different_subsection_verbosities():
for expected_line, exported_line in zip(expected_code, exported_code):
assert expected_line == exported_line, "Exported code mismatch"

check_section_executes(bivariate_section, df=get_test_df())
check_section_executes(bivariate_section, df=test_df)


def test_imports_verbosity_low():
Expand Down Expand Up @@ -449,10 +450,9 @@ def test_imports_verbosity_low_different_subsection_verbosities():
assert set(exported_imports) == set(expected_imports)


@pyarrow_parameterize
def test_show(pyarrow_dtypes: bool):
def test_show(test_df: pd.DataFrame):
bivariate_section = BivariateAnalysis()
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
with redirect_stdout(None):
bivariate_section.show(get_test_df(pyarrow_dtypes=pyarrow_dtypes))
bivariate_section.show(test_df)
211 changes: 112 additions & 99 deletions tests/test_data_type_inference.py
Original file line number Diff line number Diff line change
@@ -1,115 +1,128 @@
import numpy as np
import pandas as pd
import pytest

from edvart import data_types

from .pyarrow_utils import pyarrow_params

def test_inference():
    """Check that ``infer_data_type`` returns the expected ``DataType`` for sample series."""
    assert (
        data_types.infer_data_type(pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]))
        == data_types.DataType.NUMERIC
    ), "Should be numeric type"
    assert (
        data_types.infer_data_type(
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
        )
        == data_types.DataType.DATE
    ), "Should be date type"
    assert (
        data_types.infer_data_type(pd.Series(["A", "B", "C", "C", "A", "B"]))
        == data_types.DataType.CATEGORICAL
    ), "Should be categorical type"
    assert (
        data_types.infer_data_type(pd.Series([True, False, False, True, True]))
        == data_types.DataType.BOOLEAN
    ), "Should be boolean type"
    # The comparison has to sit outside the infer_data_type(...) call. Comparing the
    # Series itself with the enum member would pass an element-wise boolean Series to
    # infer_data_type, and the inferred type would never be compared with MISSING.
    assert (
        data_types.infer_data_type(pd.Series([None, None, np.nan, float("nan")]))
        == data_types.DataType.MISSING
    ), "Should be missing"
    assert (
        data_types.infer_data_type(pd.Series(list(range(10)))) == data_types.DataType.UNIQUE
    ), "Should be unique"
    assert (
        data_types.infer_data_type(pd.Series([1] + list(range(100)))) == data_types.DataType.NUMERIC
    ), "Should be numeric"
    assert (
        data_types.infer_data_type(pd.Series(dtype=pd.Float64Dtype)) == data_types.DataType.UNKNOWN
    ), "Should be unknown"
    # Same parenthesisation fix as for the MISSING case above.
    assert (
        data_types.infer_data_type(pd.Series([True, False])) == data_types.DataType.BOOLEAN
    ), "Should be boolean"

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "expected"),
    [
        (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), data_types.DataType.NUMERIC),
        (
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]),
            data_types.DataType.DATE,
        ),
        (pd.Series(["A", "B", "C", "C", "A", "B"]), data_types.DataType.CATEGORICAL),
        (pd.Series([True, False, False, True, True]), data_types.DataType.BOOLEAN),
        (pd.Series([None, None, np.nan, float("nan")]), data_types.DataType.MISSING),
        (pd.Series(list(range(10))), data_types.DataType.UNIQUE),
        (pd.Series([1] + list(range(100))), data_types.DataType.NUMERIC),
        (pd.Series(dtype=pd.Float64Dtype), data_types.DataType.UNKNOWN),
        (pd.Series([True, False]), data_types.DataType.BOOLEAN),
    ],
)
def test_inference(data, expected, pyarrow_dtypes):
    """Check ``infer_data_type`` on each sample series, once per ``pyarrow_params`` entry."""
    # When the flag is set, the series is first converted to pyarrow-backed dtypes.
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    inferred = data_types.infer_data_type(series)
    assert inferred == expected

def test_missing_series():
    """Check ``is_missing`` on series with only missing values and with some real values."""
    only_missing = (
        pd.Series([None, None, np.nan, float("nan")]),
        pd.Series([pd.NA]),
    )
    for series in only_missing:
        assert data_types.is_missing(series), "Should be missing"

    has_values = (
        pd.Series([1, np.nan, None]),
        pd.Series(["2023-01-01", None]),
    )
    for series in has_values:
        assert not data_types.is_missing(series), "Should not be missing"

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_missing"),
    [
        (pd.Series([None, None, np.nan, float("nan")]), True),
        (pd.Series([pd.NA]), True),
        (pd.Series([1, np.nan, None]), False),
        (pd.Series(["2023-01-01", None]), False),
    ],
)
def test_missing_series(data, is_missing, pyarrow_dtypes):
    """Check ``data_types.is_missing`` on each sample, once per ``pyarrow_params`` entry."""
    # When the flag is set, the series is first converted to pyarrow-backed dtypes.
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    verdict = data_types.is_missing(series)
    assert verdict == is_missing

def test_numeric_series():
    """Check ``is_numeric`` on numeric series and on mixed or all-missing series."""
    numeric_samples = (
        pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]),
        pd.Series([23, 45, 2, 1.2, -3, -66]),
    )
    for series in numeric_samples:
        assert data_types.is_numeric(series), "Should be numeric type"

    # Each rejected sample keeps its own failure message.
    rejected_samples = (
        (pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]), "Should not be numeric type"),
        (pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]), "Should not be numeric type"),
        (pd.Series([None, None, np.nan, float("nan")]), "Should not be numeric"),
    )
    for series, message in rejected_samples:
        assert not data_types.is_numeric(series), message

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_numeric"),
    [
        (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), True),
        (pd.Series([23, 45, 2, 1.2, -3, -66]), True),
        (pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]), False),
        (pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]), False),
        (pd.Series([None, None, np.nan, float("nan")]), False),
    ],
)
def test_numeric_series(data, is_numeric, pyarrow_dtypes):
    """Check ``data_types.is_numeric`` on each sample, once per ``pyarrow_params`` entry."""
    # When the flag is set, the series is first converted to pyarrow-backed dtypes.
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    verdict = data_types.is_numeric(series)
    assert verdict == is_numeric

def test_categorical_series():
    """Check ``is_categorical`` on accepted series and on rejected ones."""
    categorical_samples = (
        pd.Series(["A", "B", "C", "D"]),
        pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]),
    )
    for series in categorical_samples:
        assert data_types.is_categorical(series), "Should be categorical"

    rejected_samples = (
        pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]),
        pd.Series([None, None, np.nan, float("nan")]),
        pd.Series([pd.NA]),
    )
    for series in rejected_samples:
        assert not data_types.is_categorical(series), "Should not be categorical"

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_categorical"),
    [
        (pd.Series(["A", "B", "C", "D"]), True),
        (pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]), True),
        (
            pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]),
            False,
        ),
        (pd.Series([None, None, np.nan, float("nan")]), False),
        (pd.Series([pd.NA]), False),
    ],
)
def test_categorical_series(data, is_categorical, pyarrow_dtypes):
    """Check ``data_types.is_categorical`` on each sample, once per ``pyarrow_params`` entry."""
    # When the flag is set, the series is first converted to pyarrow-backed dtypes.
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    verdict = data_types.is_categorical(series)
    assert verdict == is_categorical

def test_boolean_series():
    """Check ``is_boolean`` on accepted series and on rejected ones."""
    boolean_samples = (
        pd.Series([True, False, False, True, True]),
        pd.Series([False, False, False]),
        pd.Series([True, True, True]),
        pd.Series([1, 0, 0, 1]),
        pd.Series([0, 0, 0, 0]),
        pd.Series([1, 1, 1, 1]),
    )
    for series in boolean_samples:
        assert data_types.is_boolean(series), "Should be boolean"

    rejected_samples = (
        pd.Series([True, False, False, True, True, "True"]),
        pd.Series([2, 2, 2, 2]),
        pd.Series([1, 0, 0, 1, 3]),
        pd.Series(["a", "abc", "2"]),
        pd.Series(["A", "B", "A", "A", "B"]),
        pd.Series([-0.2, 1.6567, 3, 4, 5]),
        pd.Series([None]),
    )
    for series in rejected_samples:
        assert not data_types.is_boolean(series), "Should not be boolean"

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_boolean"),
    [
        (pd.Series([True, False, False, True, True]), True),
        (pd.Series([False, False, False]), True),
        (pd.Series([True, True, True]), True),
        (pd.Series([1, 0, 0, 1]), True),
        (pd.Series([0, 0, 0, 0]), True),
        (pd.Series([1, 1, 1, 1]), True),
        (pd.Series([True, False, False, True, True, "True"]), False),
        (pd.Series([2, 2, 2, 2]), False),
        (pd.Series([1, 0, 0, 1, 3]), False),
        (pd.Series(["a", "abc", "2"]), False),
        (pd.Series(["A", "B", "A", "A", "B"]), False),
        (pd.Series([-0.2, 1.6567, 3, 4, 5]), False),
        (pd.Series([None]), False),
    ],
)
def test_boolean_series(data, is_boolean, pyarrow_dtypes):
    """Check ``data_types.is_boolean`` on each sample, once per ``pyarrow_params`` entry."""
    # When the flag is set, the series is first converted to pyarrow-backed dtypes.
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    verdict = data_types.is_boolean(series)
    assert verdict == is_boolean

def test_date_series():
    """Check ``is_date`` on date-like series and on non-date series."""
    # Samples stay in their original order; the flag says whether is_date should accept them.
    samples = (
        (pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]), True),
        (pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]), True),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]), False),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]), False),
        (pd.Series([1, 2, 3, 4, 5]), False),
        (pd.Series([None, 2.0, 3, 4, 5]), False),
        (pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]), True),
    )
    for series, should_be_date in samples:
        verdict = data_types.is_date(series)
        if should_be_date:
            assert verdict, "Should be type date"
        else:
            assert not verdict, "Should not be type date"

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_date"),
    [
        (pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]), True),
        (pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]), True),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]), False),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]), False),
        (pd.Series([1, 2, 3, 4, 5]), False),
        (pd.Series([None, 2.0, 3, 4, 5]), False),
        (pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]), True),
    ],
)
def test_date_series(data, is_date, pyarrow_dtypes):
    """Check ``data_types.is_date`` on each sample, once per ``pyarrow_params`` entry."""
    # When the flag is set, the series is first converted to pyarrow-backed dtypes.
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    verdict = data_types.is_date(series)
    assert verdict == is_date
Loading

0 comments on commit 3dc8b59

Please sign in to comment.