diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7bb2182..0e8bc5b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -3,7 +3,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10"]
+        python-version: ["3.10"]
     env:
       USING_COVERAGE: "3.10"
diff --git a/Makefile b/Makefile
index 31f051b..d7be722 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ lint:
 	@echo "Running linting..."
 	@$(BLACK) denseclus tests setup.py
 	@$(RUFF) denseclus tests setup.py --fix --preview
-	@$(PYLINT) denseclus --disable=R0902,W0222,W0221,C0103
+	@$(PYLINT) denseclus --disable=R0902,W0222,W0221,C0103,W0632
 
 lint-notebooks:
 	@echo "Linting notebooks..."
@@ -62,6 +62,7 @@ clean:
 	@echo "Cleaning..."
 	@rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache
 	@rm -rf .mypy_cache .ruff_cache .coverage build .tox
+	@coverage erase
 
 help:
 	@IFS=$$'\n' ; \
diff --git a/README.md b/README.md
index 994ed03..cd24488 100644
--- a/README.md
+++ b/README.md
@@ -24,22 +24,56 @@ DenseClus is a Python module for clustering mixed type data using [UMAP](https:/
 python3 -m pip install Amazon-DenseClus
 ```
 
-## Usage
+## Quick Start
 
 DenseClus requires a Pandas DataFrame as input with both numerical and categorical columns.
 All preprocessing and extraction are done under the hood; just call `fit` and then retrieve the clusters!
 
 ```python
 from denseclus import DenseClus
+from denseclus.utils import make_dataframe
+
+df = make_dataframe()
+
+clf = DenseClus()
+clf.fit(df)
+
+print(clf.score())
+```
+
+## Usage
+For slower but more stable results, select `intersection_union_mapper` to combine embedding layers via a third UMAP.
+Be sure that random seeds are set too!
+
+```python
 clf = DenseClus(
     umap_combine_method="intersection_union_mapper",
 )
-clf.fit(df)
+```
 
-print(clf.score())
+For advanced users, it's possible to take finer-grained control of the underlying algorithms by passing
+dictionaries into the `DenseClus` class.
+
+For example:
+```python
+from denseclus import DenseClus
+from denseclus.utils import make_dataframe
+
+umap_params = {'categorical': {'n_neighbors': 15, 'min_dist': 0.1},
+               'numerical': {'n_neighbors': 20, 'min_dist': 0.1}}
+hdbscan_params = {'min_cluster_size': 10}
+
+df = make_dataframe()
+
+clf = DenseClus(umap_combine_method="union",
+                umap_params=umap_params,
+                hdbscan_params=hdbscan_params)
+
+clf.fit(df)
 ```
+
 ## Examples
 
 A hands-on example with an overview of how to use DenseClus is available as a [Jupyter Notebook](/notebooks/DenseClus%20Example%20NB.ipynb).
diff --git a/denseclus/DenseClus.py b/denseclus/DenseClus.py
index 60a1cd5..bcb0c42 100644
--- a/denseclus/DenseClus.py
+++ b/denseclus/DenseClus.py
@@ -32,7 +32,6 @@
 import numpy as np
 import pandas as pd
 import umap.umap_ as umap
-from hdbscan import flat
 from sklearn.base import BaseEstimator, ClassifierMixin
 
 from .utils import extract_categorical, extract_numerical
@@ -60,32 +59,6 @@ class DenseClus(BaseEstimator, ClassifierMixin):
         Setting a seed may help offset the stochastic nature of
         UMAP by using a fixed random seed.
 
-    n_neighbors : int, default=30
-        Level of neighbors for UMAP.
-        Setting this higher will generate higher densities at the expense
-        of requiring more computational complexity.
-
-    min_samples : int, default=15
-        Samples used for HDBSCAN.
-        The larger this is set the more noise points get declared and the
-        more restricted clusters become to only dense areas.
-
-    min_cluster_size : int, default=100
-        Minimum Cluster size for HDBSCAN.
-        The minimum number of points from which a cluster needs to be
-        formed.
-
-    n_components : int, default=logarithm
-        Number of components for UMAP.
-        These are dimensions to reduce the data down to.
-        Ideally, this needs to be a value that preserves all the information
-        to form meaningful clusters. Default is the logarithm of total
-        number of features.
-
-    cluster_selection_method: str, default=eom
-        The HDBSCAN selection method for how flat clusters are selected from
-        the cluster hierarchy. Defaults to EOM or Excess of Mass
-
     umap_combine_method : str, default=intersection
         Method by which to combine embedding spaces.
         Options include: intersection, union, contrast,
@@ -107,32 +80,37 @@
         Level of verbosity to print when fitting and predicting.
         Setting to False will only show Warnings that appear.
 
-    flat_clusters: bool, default=False
-        Instead of determining cluster size based on density,
-        the algorithm will attempt to partition the data into the specified
-        number of clusters and the resulting clusters will have a fixed size.
-
+    umap_params : dict, optional
+        A dictionary of dictionaries keyed by 'categorical' and 'numerical', plus
+        'combined' if 'intersection_union_mapper' is selected as the 'umap_combine_method'.
+        Each dictionary should contain parameters for the UMAP used to
+        fit that slice of the data.
+        If not provided, default UMAP parameters will be used.
+
+        Example:
+        umap_params = {
+            'categorical': {'n_neighbors': 15, 'min_dist': 0.1},
+            'numerical': {'n_neighbors': 20, 'min_dist': 0.2},
+            'combined': {'n_neighbors': 5, 'min_dist': 0.1}
+        }
+
+    hdbscan_params : dict, optional
+        A dictionary containing parameters for the HDBSCAN algorithm.
+        If not provided, default HDBSCAN parameters will be used.
+
+        Example:
+        hdbscan_params = {'min_cluster_size': 10}
     """
 
     def __init__(
         self,
         random_state: int = 42,
-        n_neighbors: int = 30,
-        min_samples: int = 15,
-        min_cluster_size: int = 100,
-        n_components: int = 5,
-        cluster_selection_method: str = "eom",
         umap_combine_method: str = "intersection",
         prediction_data: bool = False,
         verbose: bool = False,
-        flat_clusters: bool = False,
+        umap_params=None,
+        hdbscan_params=None,
     ):
-        if not isinstance(n_neighbors, int) or n_neighbors <= 0:
-            raise ValueError("n_neighbors must be a positive integer")
-        if not isinstance(min_samples, int) or min_samples <= 0:
-            raise ValueError("min_samples must be a positive integer")
-        if not isinstance(min_cluster_size, int) or min_cluster_size <= 0:
-            raise ValueError("min_cluster_size must be a positive integer")
         if umap_combine_method not in [
             "intersection",
             "union",
@@ -142,14 +120,52 @@
             raise ValueError("umap_combine_method must be valid selection")
 
         self.random_state = random_state
-        self.n_neighbors = n_neighbors
-        self.min_samples = min_samples
-        self.min_cluster_size = min_cluster_size
-        self.n_components = n_components
-        self.cluster_selection_method = cluster_selection_method
         self.umap_combine_method = umap_combine_method
         self.prediction_data = prediction_data
-        self.flat_clusters = flat_clusters
+
+        # Default parameters
+        default_umap_params = {
+            "categorical": {
+                "metric": "dice",
+                "n_neighbors": 30,
+                "n_components": 5,
+                "min_dist": 0.0,
+            },
+            "numerical": {
+                "metric": "l2",
+                "n_neighbors": 30,
+                "n_components": 5,
+                "min_dist": 0.0,
+            },
+            "combined": {
+                "n_neighbors": 30,
+                "min_dist": 0.0,
+                "n_components": 5,
+            },
+        }
+
+        default_hdbscan_params = {
+            "min_cluster_size": 100,
+            "min_samples": 15,
+            "gen_min_span_tree": True,
+            "metric": "euclidean",
+        }
+
+        if umap_params is None:
+            self.umap_params = default_umap_params
+        else:
+            for key in umap_params:
+                if key in default_umap_params:
+                    default_umap_params[key].update(umap_params[key])  # type: ignore # noqa
+                else:
+                    raise ValueError(f"Invalid key '{key}' in umap_params")
+            self.umap_params = default_umap_params
+
+        if hdbscan_params is None:
+            self.hdbscan_params = default_hdbscan_params
+        else:
+            default_hdbscan_params.update(hdbscan_params)
+            self.hdbscan_params = default_hdbscan_params
 
         if verbose:
             logger.setLevel(logging.DEBUG)
@@ -191,9 +207,6 @@ def fit(self, df: pd.DataFrame) -> None:
         if not isinstance(df, pd.DataFrame):
             raise TypeError("Requires DataFrame as input")
 
-        if not isinstance(self.n_components, int):
-            self.n_components = int(round(np.log(df.shape[1])))
-
         logger.info("Extracting categorical features")
         self.categorical_ = extract_categorical(df)
 
@@ -223,13 +236,10 @@
         logger.info("Fitting UMAP for Numerical data")
 
         numerical_umap = umap.UMAP(
-            metric="l2",
-            n_neighbors=self.n_neighbors,
-            n_components=self.n_components,
-            min_dist=0.0,
             random_state=self.random_state,
             n_jobs=1 if self.random_state is not None else -1,
             verbose=False,
+            **self.umap_params["numerical"],
         ).fit(self.numerical_)
 
         self.numerical_umap_ = numerical_umap
@@ -252,13 +262,10 @@
         logger.info("Fitting UMAP for categorical data")
 
         categorical_umap = umap.UMAP(
-            metric="dice",
-            n_neighbors=self.n_neighbors,
-            n_components=self.n_components,
-            min_dist=0.0,
             random_state=self.random_state,
             n_jobs=1 if self.random_state is not None else -1,
             verbose=False,
+            **self.umap_params["categorical"],
         ).fit(self.categorical_)
 
         self.categorical_umap_ = categorical_umap
         logger.info("Categorical UMAP fitted successfully")
@@ -290,10 +297,8 @@
         elif self.umap_combine_method == "intersection_union_mapper":
             intersection_mapper = umap.UMAP(
                 random_state=self.random_state,
-                n_neighbors=self.n_neighbors,
-                n_components=self.n_components,
-                min_dist=0.0,
                 n_jobs=1 if self.random_state is not None else -1,
+                **self.umap_params["combined"],
             ).fit(self.numerical_)
             self.mapper_ = intersection_mapper * (self.numerical_umap_ + self.categorical_umap_)
@@ -313,30 +318,12 @@ def _fit_hdbscan(self):
         -------
         self
         """
-        # create clusters of a fixed size
-        if self.flat_clusters:
-            logger.info("Fitting HDBSCAN with flat clusters")
-            flat_model_ = flat.HDBSCAN_flat(
-                X=self.mapper_.embedding_,
-                cluster_selection_method=self.cluster_selection_method,
-                n_clusters=self.flat_clusters,
-                min_samples=self.min_samples,
-                metric="euclidean",
-            )
-
-            self.hdbscan_ = flat_model_
-        # or find the ideal number of clusters based on the density
-        else:
-            logger.info("Fitting HDBSCAN with default parameters")
-            hdb_ = hdbscan.HDBSCAN(
-                min_samples=self.min_samples,
-                min_cluster_size=self.min_cluster_size,
-                cluster_selection_method=self.cluster_selection_method,
-                prediction_data=self.prediction_data,
-                gen_min_span_tree=True,
-                metric="euclidean",
-            ).fit(self.mapper_.embedding_)
-            self.hdbscan_ = hdb_
+        logger.info("Fitting HDBSCAN")
+        hdb_ = hdbscan.HDBSCAN(
+            prediction_data=self.prediction_data,
+            **self.hdbscan_params,
+        ).fit(self.mapper_.embedding_)
+        self.hdbscan_ = hdb_
 
         logger.info("HDBSCAN fit")
         return self
diff --git a/denseclus/utils.py b/denseclus/utils.py
index c9dc904..41f15cd 100644
--- a/denseclus/utils.py
+++ b/denseclus/utils.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pandas as pd
 from sklearn.preprocessing import PowerTransformer
+from sklearn.datasets import make_blobs
+from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
 
 
 def extract_categorical(df: pd.DataFrame) -> pd.DataFrame:
@@ -63,3 +65,28 @@
         numerical[name] = pt.fit_transform(np.array(numerical[name]).reshape(-1, 1))
 
     return numerical
+
+
+def make_dataframe() -> pd.DataFrame:
+    """Creates a dataframe of mixed numerical and categorical data for demonstration purposes.
+
+    Returns:
+        pd.DataFrame: dataframe of categorical and numerical data
+    """
+    X, _ = make_blobs(n_samples=1000, n_features=8, random_state=10)  # pylint: disable=W0632
+    numerical = StandardScaler().fit_transform(X[:, :6])
+    categorical = KBinsDiscretizer(n_bins=3, encode="ordinal").fit_transform(X[:, 6:])
+    categorical = np.where(
+        categorical == 1.0,
+        "M",
+        np.where(categorical == 2.0, "H", "L"),
+    ).astype(str)
+
+    numerical_columns = [f"num_{i}" for i in range(numerical.shape[1])]
+    df = pd.DataFrame(numerical, columns=numerical_columns)
+
+    categorical_columns = [f"cat_{i}" for i in range(categorical.shape[1])]
+    for idx, c in enumerate(categorical_columns):
+        df[c] = categorical[:, idx]
+
+    return df
diff --git a/setup.py b/setup.py
index c568b29..8afb12b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 import setuptools
-import os
 
 with open("README.md", encoding="utf-8") as fh:
     long_description = fh.read()
diff --git a/tests/conftest.py b/tests/conftest.py
index 3367de1..ad935e8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,47 +5,29 @@
 import pytest
 import numpy as np
 import pandas as pd
-from sklearn.datasets import make_blobs
-from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
 
 from denseclus.DenseClus import DenseClus
-import warnings
+from denseclus.utils import make_dataframe
 
 
-@pytest.fixture(params=[1, 2, 3, 10])
-def n_components(request):
-    return request.param
+@pytest.fixture
+def n_components():
+    return 3
 
 
 @pytest.fixture
 def df():
-    n_clusters = 3
-    X, y = make_blobs(n_samples=1000, n_features=8, random_state=10)
-    numerical = StandardScaler().fit_transform(X[:, :6])
-    categorical = KBinsDiscretizer(n_bins=3, encode="ordinal").fit_transform(X[:, 6:])
-    categorical = np.where(
-        categorical == 1.0,
-        "M",
-        np.where(categorical == 2.0, "H", "L"),
-    ).astype(str)
-
-    numerical_columns = [f"num_{i}" for i in range(numerical.shape[1])]
-    df = pd.DataFrame(numerical, columns=numerical_columns)
-
-    categorical_columns = [f"cat_{i}" for i in range(categorical.shape[1])]
-    for idx, c in enumerate(categorical_columns):
-        df[c] = categorical[:, idx]
-
+    df = make_dataframe()
     return df
 
 
 @pytest.fixture
-def clf(df):
-    clf = DenseClus(
-        n_components=3,
-        random_state=42,
-        n_neighbors=10,
-        umap_combine_method="intersection_union_mapper",
-    )
+def clf(df, n_components):
+    umap_params = {
+        "categorical": {"n_components": n_components},
+        "numerical": {"n_components": n_components},
+        "combined": {"n_components": n_components},
+    }
+    clf = DenseClus(umap_combine_method="intersection_union_mapper", umap_params=umap_params)
     clf.fit(df)
     return clf
diff --git a/tests/denseclus_test.py b/tests/denseclus_test.py
index edef623..29701cb 100644
--- a/tests/denseclus_test.py
+++ b/tests/denseclus_test.py
@@ -2,24 +2,19 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn.datasets import make_blobs
-from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
-import warnings
 
 from denseclus.DenseClus import DenseClus
 
 
-def test_fit_categorical(n_components, df):
-    clf = DenseClus(n_components=n_components)
-    clf.fit(df)
-    assert clf.categorical_umap_.embedding_.shape == (len(df), n_components)
+def test_fit_categorical(clf, df):
+    assert clf.categorical_umap_.embedding_.shape == (len(df), clf.categorical_umap_.n_components)
 
 
 def test_fit_numerical(clf, df):
-    assert clf.numerical_umap_.embedding_.shape == (len(df), clf.n_components)
+    assert clf.numerical_umap_.embedding_.shape == (len(df), clf.numerical_umap_.n_components)
 
 
 def test_umap_embeddings(clf, df):
-    assert clf.mapper_.embedding_.shape == (len(df), clf.n_components)
+    assert clf.mapper_.embedding_.shape == (len(df), clf.mapper_.n_components[-1])
 
 
 def test_hdbscan_labels(clf, df):
@@ -45,7 +40,6 @@ def test_repr(clf):
 
 
 def test_fit_known_output(df):
-    pass
     df_small = df.head(100)
     clf = DenseClus()
     clf.fit(df_small)
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 73da184..9ec562f 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -1,6 +1,5 @@
 import pandas as pd
 import pytest
-import warnings
 
 from denseclus.utils import extract_categorical, extract_numerical, transform_numerics
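
A minimal end-to-end sketch of the API after this change, assembled from the README and the new defaults in `DenseClus.__init__`. The override values here (`n_neighbors=15`, `min_cluster_size=50`) are illustrative only; the point is that partial `umap_params` entries merge into the per-embedding defaults via `dict.update`, so only the overridden keys need to be supplied:

```python
from denseclus import DenseClus
from denseclus.utils import make_dataframe

# Demo data: six numerical columns plus two 3-bin categorical columns.
df = make_dataframe()

clf = DenseClus(
    umap_combine_method="intersection_union_mapper",
    # Only the override is listed; 'numerical' and 'combined' keep their
    # defaults, and 'categorical' keeps metric="dice", min_dist=0.0, etc.
    umap_params={"categorical": {"n_neighbors": 15}},
    hdbscan_params={"min_cluster_size": 50},
)
clf.fit(df)

print(clf.score())  # cluster assignment for each row of df
```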