Add example dataset and docs (#15)

* add example dataset module * add dataset and some docs * conditional install * conditional install less than 3.10 * less than 10
wd60622 · Jan 30, 2024 · cf60fad · cf60fad
1 parent a121f4a
commit cf60fad
Show file tree

Hide file tree

Showing 10 changed files with 193 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 [![PyPI version](https://badge.fury.io/py/pandas-bootstrap.svg)](https://badge.fury.io/py/pandas-bootstrap) 
 [![docs](https://github.com/wd60622/pandas-bootstrap/actions/workflows/docs.yml/badge.svg)](https://wd60622.github.io/pandas-bootstrap/)
 
-Bootrapping with Pandas made easy.
+Statistical Bootstrap with Pandas made easy.
 
 ## Installation
 

diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py
@@ -5,6 +5,13 @@
 from inspect import signature, Signature
 from typing import Any, Callable, List, Dict, Union, Optional, Tuple
 
+import sys
+
+if sys.version_info < (3, 10):
+    from typing_extensions import ParamSpec
+else:
+    from typing import ParamSpec
+
 from joblib import Parallel, delayed
 
 import numpy as np
@@ -15,6 +22,8 @@
 BFUNC_OUTPUT = Union[pd.DataFrame, pd.Series]
 BFUNC = Callable[[BFUNC_INPUT], BFUNC_INPUT]
 
+P = ParamSpec("P")
+
 
 class UnsupportedReturnType(Exception):
     """Raised when a bootstrap function returns an unsupported type."""
@@ -26,7 +35,9 @@ def get_return_type(bfunc: BFUNC) -> type:
     return sig.return_annotation
 
 
-def infer_return_type(df: BFUNC_INPUT, bfunc: BFUNC, **kwargs) -> Tuple[BFUNC, type]:
+def infer_return_type(
+    df: BFUNC_INPUT, bfunc: BFUNC, **kwargs: P.kwargs
+) -> Tuple[BFUNC, type]:
     """Infer the return type of a bootstrap function.
 
     Args:
@@ -111,7 +122,7 @@ def get_bfunc_processor(return_type: type) -> Union[DataFrameFunction, SeriesFun
 
 
 def create_inner_loop_func(
-    bfunc, bfunc_input, bfunc_processor, sample_kwargs, **kwargs
+    bfunc, bfunc_input, bfunc_processor, sample_kwargs, **kwargs: P.kwargs
 ):
     def inner_loop(i):
         boot_sample = bfunc(bfunc_input.sample(**sample_kwargs), **kwargs)
@@ -122,7 +133,9 @@ def inner_loop(i):
     return inner_loop
 
 
-def inner_loop(bfunc, bfunc_input, bfunc_processor, i, sample_kwargs, **kwargs):
+def inner_loop(
+    bfunc, bfunc_input, bfunc_processor, i, sample_kwargs, **kwargs: P.kwargs
+):
     boot_sample = bfunc(bfunc_input.sample(**sample_kwargs), **kwargs)
     boot_sample = bfunc_processor.name(boot_sample, i)
 
@@ -154,7 +167,7 @@ def bootstrap(
     B: int = 100,
     sample_kwargs: Optional[Dict[str, Any]] = None,
     parallel: Optional[Parallel] = None,
-    **kwargs,
+    **kwargs: P.kwargs,
 ) -> Union[pd.DataFrame, pd.Series]:
     """Core bootstrap function.
 

diff --git a/bootstrap/datasets.py b/bootstrap/datasets.py
@@ -0,0 +1,112 @@
+"""Example datasets to test out the boot attribute."""
+from __future__ import annotations
+
+from typing import List, Union
+
+import pandas as pd
+import numpy as np
+
+
+ArrayLike = Union[List, np.ndarray, pd.Series]
+
+
+def different_mean_and_sigma(
+    n: int,
+    groups: int = 2,
+    random_state: int | None = None,
+    mu: ArrayLike | None = None,
+    sigma: ArrayLike | None = None,
+) -> pd.DataFrame:
+    """Generate a dataset with different mean and sigma for each group.
+
+    Every row is assigned to one of ``groups`` groups at random, and its
+    ``x`` value is drawn from a normal distribution parameterized by that
+    group's mean and sigma.
+
+    Args:
+        n: Number of samples
+        groups: Number of groups
+        random_state: Random state passed to ``np.random.default_rng``
+        mu: Optional mean for each group. When omitted, one mean per group
+            is drawn from a normal distribution (loc 0, scale 1).
+        sigma: Optional sigma for each group. When omitted, one sigma per
+            group is drawn from a gamma distribution (shape 1, scale 1).
+
+    Returns:
+        A DataFrame with two columns: group and x
+
+    Raises:
+        ValueError: If ``mu`` or ``sigma`` does not have exactly ``groups``
+            elements.
+
+    Examples:
+        Get the confidence intervals for the mean and sigma
+
+        ```python
+        import pandas as pd
+
+        from bootstrap.datasets import different_mean_and_sigma
+
+        n = 100
+        mu = [1, 2]
+        sigma = [1, 4]
+        df = different_mean_and_sigma(n=n, random_state=0, mu=mu, sigma=sigma)
+        ```
+        Which looks like this:
+        ```text
+            group         x
+        0       1  3.429522
+        1       1 -2.833275
+        2       1  1.982183
+        3       0  1.656475
+        4       0 -0.288361
+        ..    ...       ...
+        95      1  0.564347
+        96      0 -0.901635
+        97      0  0.891085
+        98      0  0.196268
+        99      1  6.320654
+
+        [100 rows x 2 columns]
+        ```
+
+        Define a statistic function and pass it to the `get_samples` method.
+
+        ```python
+        def mean_and_sigma_by_group(df: pd.DataFrame) -> pd.DataFrame:
+            return df.groupby("group")["x"].agg(['mean', 'std'])
+
+        B = 1_000
+        sample_kwargs = {"random_state": 0}
+        df_boot = df.boot.get_samples(
+            mean_and_sigma_by_group,
+            B=B,
+            sample_kwargs=sample_kwargs
+        )
+        ```
+        Which looks like this:
+
+        ```text
+                          mean       std
+        group sample
+        0     0       1.011128  0.960958
+              1       0.995973  0.874010
+              2       0.961375  0.941634
+              3       0.986562  0.848745
+              4       0.749629  0.982982
+        ...                ...       ...
+        1     995     1.797657  4.103178
+              996     2.836222  3.584542
+              997     2.587314  3.873845
+              998     3.176353  3.444296
+              999     2.817353  3.597222
+
+        [2000 rows x 2 columns]
+        ```
+
+    """
+
+    rng = np.random.default_rng(random_state)
+
+    # Compare against None explicitly instead of truth-testing: a
+    # multi-element array or Series raises on truth-testing, and a
+    # one-element array holding 0 would be silently replaced by a random
+    # draw. An empty sequence is treated as supplied and goes through the
+    # length check below.
+    if mu is None:
+        mu = rng.normal(0, 1, size=groups)
+    if sigma is None:
+        sigma = rng.gamma(1, 1, size=groups)
+
+    mu = np.asarray(mu)
+    sigma = np.asarray(sigma)
+
+    if len(mu) != groups or len(sigma) != groups:
+        raise ValueError(f"mu and sigma must have the same length as groups. {groups = }")
+
+    # Group label per row; the labels double as indices into mu and sigma.
+    group = rng.choice(np.arange(groups), size=n)
+
+    return pd.DataFrame({
+        'group': group,
+        'x': rng.normal(mu[group], sigma[group], size=n),
+    })
+
+
+
diff --git a/bootstrap/extensions.py b/bootstrap/extensions.py
@@ -13,15 +13,24 @@
 ```
 
 """
+import sys
 from typing import Any, Dict, Optional, Union
 
+if sys.version_info < (3, 10):
+    from typing_extensions import ParamSpec
+else:
+    from typing import ParamSpec
+
 from joblib import Parallel
 
 import pandas as pd
 
 from bootstrap.bootstrap import bootstrap, BFUNC
 
 
+P = ParamSpec("P")
+
+
 class AccessorMixin:
     """Common functionality for DataFrame and Series accessors."""
 
@@ -34,7 +43,7 @@ def get_samples(
         B: int = 100,
         sample_kwargs: Dict[str, Any] = None,
         parallel: Optional[Parallel] = None,
-        **kwargs,
+        **kwargs: P.kwargs,
     ) -> Union[pd.Series, pd.DataFrame]:
         """Get bootstrap samples of the object.
 

diff --git a/docs/datasets.md b/docs/datasets.md
@@ -0,0 +1,6 @@
+---
+comments: true
+---
+# datasets
+
+::: bootstrap.datasets
diff --git a/docs/examples/groupby-input.md b/docs/examples/groupby-input.md
@@ -0,0 +1,12 @@
+---
+comments: true
+---
+# GroupBy Input
+
+`DataFrameGroupBy` and `SeriesGroupBy` objects do not have a `boot` attribute.
+However, the `groupby` method can be used inside the `bfunc`.
+
+!!! note 
+    Because the whole dataset is resampled before it is grouped, each group's size in a bootstrap sample can differ from its size in the original dataset.
+
+Check out the example [here](../datasets.md#bootstrap.datasets.different_mean_and_sigma) which uses a `bfunc` with the `groupby` method.
diff --git a/docs/index.md b/docs/index.md
@@ -4,7 +4,7 @@ hide:
 ---
 # Pandas Bootstrap
 
-Bootrapping with Pandas made easy.
+Statistical Bootstrap with Pandas made easy.
 
 ## Installation
 
@@ -55,4 +55,4 @@ sample
 4       3.0  8.0
 ```
 
-Read more in examples [here](./examples/linear-regression.md).
+Read more in examples [here](./examples/linear-regression.md).
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -42,11 +42,13 @@ nav:
   - Modules: 
     - extensions.md
     - bootstrap.md
+    - datasets.md
   - Examples: 
     - Linear Regression: examples/linear-regression.md
     - examples/correlation.md
     - examples/return-type.md
     - examples/parallelization.md
+    - examples/groupby-input.md
 
 plugins:
 - search
@@ -62,6 +64,8 @@ markdown_extensions:
       anchor_linenums: true
       line_spans: __span
       pygments_lang_class: true
+  - admonition
   - pymdownx.inlinehilite
+  - pymdownx.details
   - pymdownx.snippets
   - pymdownx.superfences
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "pandas-bootstrap"
-version = "0.2.0"
-description = "Bootstrapping with Pandas made easy"
+version = "0.2.1"
+description = "Statistical Bootstrap with Pandas made easy"
 authors = ["Will Dean <wd60622@gmail.com>"]
 readme = "README.md"
 license = "MIT"
@@ -30,7 +30,7 @@ classifiers = [
 python = ">=3.8,<4.0"
 pandas = ">=1.0.0"
 joblib = "^1.1.1"
-
+typing_extensions = { version = "*", python = "<3.10" }
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.0"

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -0,0 +1,26 @@
+import pandas as pd 
+
+from bootstrap.datasets import different_mean_and_sigma
+
+
+def test_different_mean_and_sigma() -> None:
+    """A default call returns an ``n``-row frame with ``group`` and ``x`` columns."""
+    n_rows = 10
+    result = different_mean_and_sigma(n=n_rows, random_state=0)
+
+    assert isinstance(result, pd.DataFrame)
+    assert result.shape == (n_rows, 2)
+    assert list(result.columns) == ["group", "x"]
+
+
+def test_different_mean_and_sigma_with_mu_sigma() -> None:
+    """Explicit ``mu`` and ``sigma`` order the per-group sample statistics."""
+    df = different_mean_and_sigma(n=1000, random_state=0, mu=[0, 1], sigma=[1, 2])
+
+    # One row per group, with the sample mean and standard deviation of x.
+    stats = df.groupby("group")["x"].agg(["mean", "std"])
+
+    assert stats.loc[0, "mean"] < stats.loc[1, "mean"]
+    assert stats.loc[0, "std"] < stats.loc[1, "std"]
+
+