diff --git a/README.md b/README.md index 7c89b3d..1fd347d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![PyPI version](https://badge.fury.io/py/pandas-bootstrap.svg)](https://badge.fury.io/py/pandas-bootstrap) [![docs](https://github.com/wd60622/pandas-bootstrap/actions/workflows/docs.yml/badge.svg)](https://wd60622.github.io/pandas-bootstrap/) -Bootrapping with Pandas made easy. +Statistical Bootstrap with Pandas made easy. ## Installation diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py index 04f4789..a27357d 100644 --- a/bootstrap/bootstrap.py +++ b/bootstrap/bootstrap.py @@ -5,6 +5,13 @@ from inspect import signature, Signature from typing import Any, Callable, List, Dict, Union, Optional, Tuple +import sys + +if sys.version_info < (3, 10): + from typing_extensions import ParamSpec +else: + from typing import ParamSpec + from joblib import Parallel, delayed import numpy as np @@ -15,6 +22,8 @@ BFUNC_OUTPUT = Union[pd.DataFrame, pd.Series] BFUNC = Callable[[BFUNC_INPUT], BFUNC_INPUT] +P = ParamSpec("P") + class UnsupportedReturnType(Exception): """Raised when a bootstrap function returns an unsupported type.""" @@ -26,7 +35,9 @@ def get_return_type(bfunc: BFUNC) -> type: return sig.return_annotation -def infer_return_type(df: BFUNC_INPUT, bfunc: BFUNC, **kwargs) -> Tuple[BFUNC, type]: +def infer_return_type( + df: BFUNC_INPUT, bfunc: BFUNC, **kwargs: P.kwargs +) -> Tuple[BFUNC, type]: """Infer the return type of a bootstrap function. 
Args: @@ -111,7 +122,7 @@ def get_bfunc_processor(return_type: type) -> Union[DataFrameFunction, SeriesFun def create_inner_loop_func( - bfunc, bfunc_input, bfunc_processor, sample_kwargs, **kwargs + bfunc, bfunc_input, bfunc_processor, sample_kwargs, **kwargs: P.kwargs ): def inner_loop(i): boot_sample = bfunc(bfunc_input.sample(**sample_kwargs), **kwargs) @@ -122,7 +133,9 @@ def inner_loop(i): return inner_loop -def inner_loop(bfunc, bfunc_input, bfunc_processor, i, sample_kwargs, **kwargs): +def inner_loop( + bfunc, bfunc_input, bfunc_processor, i, sample_kwargs, **kwargs: P.kwargs +): boot_sample = bfunc(bfunc_input.sample(**sample_kwargs), **kwargs) boot_sample = bfunc_processor.name(boot_sample, i) @@ -154,7 +167,7 @@ def bootstrap( B: int = 100, sample_kwargs: Optional[Dict[str, Any]] = None, parallel: Optional[Parallel] = None, - **kwargs, + **kwargs: P.kwargs, ) -> Union[pd.DataFrame, pd.Series]: """Core bootstrap function. diff --git a/bootstrap/datasets.py b/bootstrap/datasets.py new file mode 100644 index 0000000..2f10a1c --- /dev/null +++ b/bootstrap/datasets.py @@ -0,0 +1,112 @@ +"""Example datasets to test out the boot attribute.""" +from __future__ import annotations + +from typing import List, Union + +import pandas as pd +import numpy as np + + +ArrayLike = Union[List, np.ndarray, pd.Series] + + +def different_mean_and_sigma(n: int, groups: int = 2, random_state: int | None = None, mu: ArrayLike | None = None, sigma: ArrayLike | None =None) -> pd.DataFrame: + """Generate a dataset with different mean and sigma for each group. 
+ + Args: + n: Number of samples + groups: Number of groups + random_state: Random state + mu: Optional mean for each group + sigma: Optional sigma for each group + + Returns: + A DataFrame with two columns: group and x + + Examples: + Get the confidence intervals for the mean and sigma + + ```python + import pandas as pd + + from bootstrap.datasets import different_mean_and_sigma + + n = 100 + mu = [1, 2] + sigma = [1, 4] + df = different_mean_and_sigma(n=n, random_state=0, mu=mu, sigma=sigma) + ``` + Which looks like this: + ```text + group x + 0 1 3.429522 + 1 1 -2.833275 + 2 1 1.982183 + 3 0 1.656475 + 4 0 -0.288361 + .. ... ... + 95 1 0.564347 + 96 0 -0.901635 + 97 0 0.891085 + 98 0 0.196268 + 99 1 6.320654 + + [100 rows x 2 columns] + ``` + + Define our statistic function and pass it to the `get_samples` method. + + ```python + def mean_and_sigma_by_group(df: pd.DataFrame) -> pd.DataFrame: + return df.groupby("group")["x"].agg(['mean', 'std']) + + B = 1_000 + sample_kwargs = {"random_state": 0} + df_boot = df.boot.get_samples( + mean_and_sigma_by_group, + B=B, + sample_kwargs=sample_kwargs + ) + ``` + Which looks like this: + + ```text + mean std + group sample + 0 0 1.011128 0.960958 + 1 0.995973 0.874010 + 2 0.961375 0.941634 + 3 0.986562 0.848745 + 4 0.749629 0.982982 + ... ... ... + 1 995 1.797657 4.103178 + 996 2.836222 3.584542 + 997 2.587314 3.873845 + 998 3.176353 3.444296 + 999 2.817353 3.597222 + + [2000 rows x 2 columns] + ``` + + """ + + rng = np.random.default_rng(random_state) + + mu = mu or rng.normal(0, 1, size=groups) + sigma = sigma or rng.gamma(1, 1, size=groups) + + mu = np.array(mu) + sigma = np.array(sigma) + + if len(mu) != groups or len(sigma) != groups: + raise ValueError(f"mu and sigma must have the same length as groups. 
{groups = }") + + group = rng.choice(np.arange(groups), size=n) + + return pd.DataFrame({ + 'group': group, + 'x': rng.normal(mu[group], sigma[group], size=n), + }) + + + diff --git a/bootstrap/extensions.py b/bootstrap/extensions.py index 9445707..7fa9d32 100644 --- a/bootstrap/extensions.py +++ b/bootstrap/extensions.py @@ -13,8 +13,14 @@ ``` """ +import sys from typing import Any, Dict, Optional, Union +if sys.version_info < (3, 10): + from typing_extensions import ParamSpec +else: + from typing import ParamSpec + from joblib import Parallel import pandas as pd @@ -22,6 +28,9 @@ from bootstrap.bootstrap import bootstrap, BFUNC +P = ParamSpec("P") + + class AccessorMixin: """Common functionality for DataFrame and Series accessors.""" @@ -34,7 +43,7 @@ def get_samples( B: int = 100, sample_kwargs: Dict[str, Any] = None, parallel: Optional[Parallel] = None, - **kwargs, + **kwargs: P.kwargs, ) -> Union[pd.Series, pd.DataFrame]: """Get bootstrap samples of the object. diff --git a/docs/datasets.md b/docs/datasets.md new file mode 100644 index 0000000..fb07f1e --- /dev/null +++ b/docs/datasets.md @@ -0,0 +1,6 @@ +--- +comments: true +--- +# datasets + +::: bootstrap.datasets diff --git a/docs/examples/groupby-input.md b/docs/examples/groupby-input.md new file mode 100644 index 0000000..5dacdb5 --- /dev/null +++ b/docs/examples/groupby-input.md @@ -0,0 +1,12 @@ +--- +comments: true +--- +# GroupBy Input + +The `DataFrameGroupBy` and `SeriesGroupBy` do not have `boot` attributes. +However, the `groupby` method can be used in the `bfunc`. + +!!! note + Each group will have different sample sizes than the original dataset. + +Check out the example [here](../datasets.md#bootstrap.datasets.different_mean_and_sigma) which uses a `bfunc` with the `groupby` method. diff --git a/docs/index.md b/docs/index.md index 41080df..f782286 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,7 +4,7 @@ hide: --- # Pandas Bootstrap -Bootrapping with Pandas made easy. 
+Statistical Bootstrap with Pandas made easy. ## Installation @@ -55,4 +55,4 @@ sample 4 3.0 8.0 ``` -Read more in examples [here](./examples/linear-regression.md). \ No newline at end of file +Read more in examples [here](./examples/linear-regression.md). diff --git a/mkdocs.yml b/mkdocs.yml index 72c9c3a..4aeb0b9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,11 +42,13 @@ nav: - Modules: - extensions.md - bootstrap.md + - datasets.md - Examples: - Linear Regression: examples/linear-regression.md - examples/correlation.md - examples/return-type.md - examples/parallelization.md + - examples/groupby-input.md plugins: - search @@ -62,6 +64,8 @@ markdown_extensions: anchor_linenums: true line_spans: __span pygments_lang_class: true + - admonition - pymdownx.inlinehilite + - pymdownx.details - pymdownx.snippets - pymdownx.superfences diff --git a/pyproject.toml b/pyproject.toml index 418c75a..821adf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "pandas-bootstrap" -version = "0.2.0" -description = "Bootstrapping with Pandas made easy" +version = "0.2.1" +description = "Statistical Bootstrap with Pandas made easy" authors = ["Will Dean "] readme = "README.md" license = "MIT" @@ -30,7 +30,7 @@ classifiers = [ python = ">=3.8,<4.0" pandas = ">=1.0.0" joblib = "^1.1.1" - +typing_extensions = { version = "*", python = "<3.10" } [tool.poetry.group.dev.dependencies] pytest = "^7.4.0" diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 0000000..42ff11c --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,26 @@ +import pandas as pd + +from bootstrap.datasets import different_mean_and_sigma + + +def test_different_mean_and_sigma() -> None: + n = 10 + df = different_mean_and_sigma(n=n, random_state=0) + + assert isinstance(df, pd.DataFrame) + assert df.shape == (n, 2) + assert df.columns.tolist() == ["group", "x"] + + +def test_different_mean_and_sigma_with_mu_sigma() -> None: + n = 1000 + mu = [0, 1] + 
sigma = [1, 2] + df = different_mean_and_sigma(n=n, random_state=0, mu=mu, sigma=sigma) + + mean = df.groupby('group')["x"].mean() + std = df.groupby("group")["x"].std() + + assert mean[0] < mean[1] + assert std[0] < std[1] +