Skip to content

Commit

Permalink
Add example dataset and docs (#15)
Browse files Browse the repository at this point in the history
* add example dataset module

* add dataset and some docs

* conditional install

* conditional install less than 3.10

* less than 10
  • Loading branch information
wd60622 authored Jan 30, 2024
1 parent a121f4a commit cf60fad
Show file tree
Hide file tree
Showing 10 changed files with 193 additions and 11 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[![PyPI version](https://badge.fury.io/py/pandas-bootstrap.svg)](https://badge.fury.io/py/pandas-bootstrap)
[![docs](https://github.com/wd60622/pandas-bootstrap/actions/workflows/docs.yml/badge.svg)](https://wd60622.github.io/pandas-bootstrap/)

Bootrapping with Pandas made easy.
Statistical Bootstrap with Pandas made easy.

## Installation

Expand Down
21 changes: 17 additions & 4 deletions bootstrap/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
from inspect import signature, Signature
from typing import Any, Callable, List, Dict, Union, Optional, Tuple

import sys

if sys.version_info < (3, 10):
from typing_extensions import ParamSpec
else:
from typing import ParamSpec

from joblib import Parallel, delayed

import numpy as np
Expand All @@ -15,6 +22,8 @@
BFUNC_OUTPUT = Union[pd.DataFrame, pd.Series]
BFUNC = Callable[[BFUNC_INPUT], BFUNC_INPUT]

P = ParamSpec("P")


class UnsupportedReturnType(Exception):
"""Raised when a bootstrap function returns an unsupported type."""
Expand All @@ -26,7 +35,9 @@ def get_return_type(bfunc: BFUNC) -> type:
return sig.return_annotation


def infer_return_type(df: BFUNC_INPUT, bfunc: BFUNC, **kwargs) -> Tuple[BFUNC, type]:
def infer_return_type(
df: BFUNC_INPUT, bfunc: BFUNC, **kwargs: P.kwargs
) -> Tuple[BFUNC, type]:
"""Infer the return type of a bootstrap function.
Args:
Expand Down Expand Up @@ -111,7 +122,7 @@ def get_bfunc_processor(return_type: type) -> Union[DataFrameFunction, SeriesFun


def create_inner_loop_func(
bfunc, bfunc_input, bfunc_processor, sample_kwargs, **kwargs
bfunc, bfunc_input, bfunc_processor, sample_kwargs, **kwargs: P.kwargs
):
def inner_loop(i):
boot_sample = bfunc(bfunc_input.sample(**sample_kwargs), **kwargs)
Expand All @@ -122,7 +133,9 @@ def inner_loop(i):
return inner_loop


def inner_loop(bfunc, bfunc_input, bfunc_processor, i, sample_kwargs, **kwargs):
def inner_loop(
bfunc, bfunc_input, bfunc_processor, i, sample_kwargs, **kwargs: P.kwargs
):
boot_sample = bfunc(bfunc_input.sample(**sample_kwargs), **kwargs)
boot_sample = bfunc_processor.name(boot_sample, i)

Expand Down Expand Up @@ -154,7 +167,7 @@ def bootstrap(
B: int = 100,
sample_kwargs: Optional[Dict[str, Any]] = None,
parallel: Optional[Parallel] = None,
**kwargs,
**kwargs: P.kwargs,
) -> Union[pd.DataFrame, pd.Series]:
"""Core bootstrap function.
Expand Down
112 changes: 112 additions & 0 deletions bootstrap/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Example datasets to test out the boot attribute."""
from __future__ import annotations

from typing import List, Union

import pandas as pd
import numpy as np


ArrayLike = Union[List, np.ndarray, pd.Series]


def different_mean_and_sigma(
    n: int,
    groups: int = 2,
    random_state: int | None = None,
    mu: ArrayLike | None = None,
    sigma: ArrayLike | None = None,
) -> pd.DataFrame:
    """Generate a dataset with different mean and sigma for each group.

    Each row is assigned at random to one of the `groups` groups and its `x`
    value is drawn from a normal distribution using that group's mean and
    sigma.

    Args:
        n: Number of samples
        groups: Number of groups
        random_state: Random state
        mu: Optional mean for each group. Drawn from a standard normal
            distribution when not provided.
        sigma: Optional sigma for each group. Drawn from a Gamma(1, 1)
            distribution when not provided.

    Returns:
        A DataFrame with two columns: group and x

    Raises:
        ValueError: If `mu` or `sigma` does not have exactly `groups` elements.

    Examples:
        Get the confidence intervals for the mean and sigma

        ```python
        import pandas as pd

        from bootstrap.datasets import different_mean_and_sigma

        n = 100
        mu = [1, 2]
        sigma = [1, 4]
        df = different_mean_and_sigma(n=n, random_state=0, mu=mu, sigma=sigma)
        ```

        Which looks like this:

        ```text
            group         x
        0       1  3.429522
        1       1 -2.833275
        2       1  1.982183
        3       0  1.656475
        4       0 -0.288361
        ..    ...       ...
        95      1  0.564347
        96      0 -0.901635
        97      0  0.891085
        98      0  0.196268
        99      1  6.320654

        [100 rows x 2 columns]
        ```

        Define the statistic function and pass it to the `get_samples` method.

        ```python
        def mean_and_sigma_by_group(df: pd.DataFrame) -> pd.DataFrame:
            return df.groupby("group")["x"].agg(['mean', 'std'])

        B = 1_000
        sample_kwargs = {"random_state": 0}
        df_boot = df.boot.get_samples(
            mean_and_sigma_by_group,
            B=B,
            sample_kwargs=sample_kwargs
        )
        ```

        Which looks like this:

        ```text
                          mean       std
        group sample
        0     0       1.011128  0.960958
              1       0.995973  0.874010
              2       0.961375  0.941634
              3       0.986562  0.848745
              4       0.749629  0.982982
        ...                ...       ...
        1     995     1.797657  4.103178
              996     2.836222  3.584542
              997     2.587314  3.873845
              998     3.176353  3.444296
              999     2.817353  3.597222

        [2000 rows x 2 columns]
        ```

    """
    rng = np.random.default_rng(random_state)

    # Test against None explicitly: `mu or ...` evaluates the truthiness of
    # the argument, which raises for multi-element ndarray / Series inputs.
    if mu is None:
        mu = rng.normal(0, 1, size=groups)
    if sigma is None:
        sigma = rng.gamma(1, 1, size=groups)

    mu = np.asarray(mu)
    sigma = np.asarray(sigma)

    if len(mu) != groups or len(sigma) != groups:
        raise ValueError(f"mu and sigma must have the same length as groups. {groups = }")

    # Group label for every row; also used to index the per-group parameters.
    group = rng.choice(np.arange(groups), size=n)

    return pd.DataFrame({
        'group': group,
        'x': rng.normal(mu[group], sigma[group], size=n),
    })



11 changes: 10 additions & 1 deletion bootstrap/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,24 @@
```
"""
import sys
from typing import Any, Dict, Optional, Union

if sys.version_info < (3, 10):
from typing_extensions import ParamSpec
else:
from typing import ParamSpec

from joblib import Parallel

import pandas as pd

from bootstrap.bootstrap import bootstrap, BFUNC


P = ParamSpec("P")


class AccessorMixin:
"""Common functionality for DataFrame and Series accessors."""

Expand All @@ -34,7 +43,7 @@ def get_samples(
B: int = 100,
sample_kwargs: Dict[str, Any] = None,
parallel: Optional[Parallel] = None,
**kwargs,
**kwargs: P.kwargs,
) -> Union[pd.Series, pd.DataFrame]:
"""Get bootstrap samples of the object.
Expand Down
6 changes: 6 additions & 0 deletions docs/datasets.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
comments: true
---
# datasets

::: bootstrap.datasets
12 changes: 12 additions & 0 deletions docs/examples/groupby-input.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
comments: true
---
# GroupBy Input

`DataFrameGroupBy` and `SeriesGroupBy` objects do not have a `boot` attribute.
However, the `groupby` method can be used inside the `bfunc`.

!!! note
Because the rows are resampled before grouping, the size of each group in a bootstrap sample can differ from its size in the original dataset.

Check out the example [here](../datasets.md#bootstrap.datasets.different_mean_and_sigma) which uses a `bfunc` with the `groupby` method.
4 changes: 2 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ hide:
---
# Pandas Bootstrap

Bootrapping with Pandas made easy.
Statistical Bootstrap with Pandas made easy.

## Installation

Expand Down Expand Up @@ -55,4 +55,4 @@ sample
4 3.0 8.0
```

Read more in examples [here](./examples/linear-regression.md).
Read more in examples [here](./examples/linear-regression.md).
4 changes: 4 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,13 @@ nav:
- Modules:
- extensions.md
- bootstrap.md
- datasets.md
- Examples:
- Linear Regression: examples/linear-regression.md
- examples/correlation.md
- examples/return-type.md
- examples/parallelization.md
- examples/groupby-input.md

plugins:
- search
Expand All @@ -62,6 +64,8 @@ markdown_extensions:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- admonition
- pymdownx.inlinehilite
- pymdownx.details
- pymdownx.snippets
- pymdownx.superfences
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "pandas-bootstrap"
version = "0.2.0"
description = "Bootstrapping with Pandas made easy"
version = "0.2.1"
description = "Statistical Bootstrap with Pandas made easy"
authors = ["Will Dean <wd60622@gmail.com>"]
readme = "README.md"
license = "MIT"
Expand Down Expand Up @@ -30,7 +30,7 @@ classifiers = [
python = ">=3.8,<4.0"
pandas = ">=1.0.0"
joblib = "^1.1.1"

typing_extensions = { version = "*", python = "<3.10" }

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
Expand Down
26 changes: 26 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pandas as pd

from bootstrap.datasets import different_mean_and_sigma


def test_different_mean_and_sigma() -> None:
    """Default call returns an n-row DataFrame with `group` and `x` columns."""
    n_rows = 10
    result = different_mean_and_sigma(n=n_rows, random_state=0)

    assert isinstance(result, pd.DataFrame)
    assert result.shape == (n_rows, 2)
    assert list(result.columns) == ["group", "x"]


def test_different_mean_and_sigma_with_mu_sigma() -> None:
    """Explicit mu / sigma are reflected in the per-group sample statistics."""
    df = different_mean_and_sigma(
        n=1000,
        random_state=0,
        mu=[0, 1],
        sigma=[1, 2],
    )

    # Group once and derive both statistics from the same grouping.
    x_by_group = df.groupby('group')["x"]
    mean = x_by_group.mean()
    std = x_by_group.std()

    assert mean[0] < mean[1]
    assert std[0] < std[1]

0 comments on commit cf60fad

Please sign in to comment.