From 8c740e9db24d7aa6f936068cd021de912ac08fc6 Mon Sep 17 00:00:00 2001
From: Charles Frenzel
Date: Sun, 12 Nov 2023 10:28:11 -0500
Subject: [PATCH] black readme and update docstring

---
 README.md                                     | 37 +++++++++++++++----
 denseclus/DenseClus.py                        | 33 +++++++++++++----
 ...leNB.ipynb => 01_DenseClusExampleNB.ipynb} |  0
 ...BSCAN.ipynb => 02_TuningWithHDBSCAN.ipynb} |  0
 ...rUMAP.ipynb => 03_ValidationForUMAP.ipynb} |  0
 pyproject.toml                                |  2 +-
 6 files changed, 55 insertions(+), 17 deletions(-)
 rename notebooks/{DenseClusExampleNB.ipynb => 01_DenseClusExampleNB.ipynb} (100%)
 rename notebooks/{TuningwithHDBSCAN.ipynb => 02_TuningWithHDBSCAN.ipynb} (100%)
 rename notebooks/{ValidationForUMAP.ipynb => 03_ValidationForUMAP.ipynb} (100%)

diff --git a/README.md b/README.md
index cd24488..6d0e82a 100644
--- a/README.md
+++ b/README.md
@@ -43,8 +43,7 @@ print(clf.score())
 
 ## Usage
 
-For slower but more stable results select `intersection_union_mapper` to combine embedding layers via third UMAP.
-Be sure that random seeds are set too!
+For slower but more **stable** results, select `intersection_union_mapper` to combine embedding layers via a third UMAP, which gives equal weight to both numeric and categorical columns. By default, the random seed is set; this prevents UMAP from running in parallel but helps circumvent some of [the randomness](https://umap-learn.readthedocs.io/en/latest/reproducibility.html) of the algorithm.
 
 ```python
 clf = DenseClus(
@@ -52,6 +51,8 @@ clf = DenseClus(
 )
 ```
 
+### Advanced Usage
+
 For advanced users, it's possible to select more fine-grained control of the underlying algorithms by passing dictionaries into `DenseClus`
 class.
 
@@ -60,15 +61,18 @@ For example:
 from denseclus import DenseClus
 from denseclus.utils import make_dataframe
 
-umap_params = {'categorical': {'n_neighbors': 15, 'min_dist': 0.1},
-               'numerical': {'n_neighbors': 20, 'min_dist': 0.1}}
-hdbscan_params = {'min_cluster_size': 10}
+umap_params = {
+    "categorical": {"n_neighbors": 15, "min_dist": 0.1},
+    "numerical": {"n_neighbors": 20, "min_dist": 0.1},
+}
+hdbscan_params = {"min_cluster_size": 10}
 
 df = make_dataframe()
 clf = DenseClus(umap_combine_method="union"
-                ,umap_params=umap_params
-                ,hdbscan_params=hdbscan_params)
+                , umap_params=umap_params
+                , hdbscan_params=hdbscan_params
+                , random_state=None) # this will run in parallel
 clf.fit(df)
 ```
 
@@ -76,7 +80,24 @@ clf.fit(df)
 
 ## Examples
 
-A hands-on example with an overview of how to use is currently available in the form of a [Jupyter Notebook](/notebooks/DenseClus%20Example%20NB.ipynb).
+### Notebooks
+
+A hands-on example with an overview of how to use DenseClus is currently available in the form of an [Example Jupyter Notebook](/notebooks/01_DenseClusExampleNB.ipynb).
+
+Should you need to tune HDBSCAN, here is an optional approach: [Tuning with HDBSCAN Notebook](/notebooks/02_TuningWithHDBSCAN.ipynb)
+
+Should you need to validate UMAP embeddings, there is an approach to do so in the [Validation for UMAP Notebook](/notebooks/03_ValidationForUMAP.ipynb)
+
+### Blogs
+
+
+[AWS Blog: Introducing DenseClus, an open source clustering package for mixed-type data](https://aws.amazon.com/blogs/opensource/introducing-denseclus-an-open-source-clustering-package-for-mixed-type-data/)
+
+[TDS Blog: How To Tune HDBSCAN](https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970)
+
+[TDS Blog: On the Validation of UMAP](https://towardsdatascience.com/on-the-validating-umap-embeddings-2c8907588175)
+
+
 ## References
 
diff --git a/denseclus/DenseClus.py b/denseclus/DenseClus.py
index bcb0c42..d554381 100644
--- a/denseclus/DenseClus.py
+++ b/denseclus/DenseClus.py
@@ -52,21 +52,38 @@ class DenseClus(BaseEstimator, ClassifierMixin):
     Parameters
     ----------
-    random_state : int, default=None
+    random_state : int, default=42
         Random State for both UMAP and numpy.random.
         If set to None UMAP will run in Numba in multicore mode but
         results may vary between runs.
         Setting a seed may help to offset the stochastic nature of
        UMAP by setting it with fixed random seed.
 
-    umap_combine_method : str, default=intersection
+    umap_combine_method : str, default=contrast
         Method by which to combine embeddings spaces.
         Options include: intersection, union,
         contrast,
-        intersection_union_mapper
-        The latter combines both the intersection and union of
-        the embeddings.
-        See:
-        https://umap-learn.readthedocs.io/en/latest/composing_models.html
+        and intersection_union_mapper. Each method is
+        described below.
+
+        'intersection' preserves the numerical embeddings more, focusing on the quantitative aspects of
+        the data. This method is particularly useful when the numerical data is of higher importance or
+        relevance to the clustering task.
+
+        'union' preserves the categorical embeddings more, emphasizing the qualitative aspects of the
+        data. This method is ideal when the categorical data carries significant weight or importance in
+        the clustering task.
+
+        'contrast' highlights the differences between the numerical and categorical embeddings, providing
+        a more balanced representation of both. This method is particularly useful when there are
+        significant differences between the numerical and categorical data and both types of data are
+        equally important for the clustering task.
+
+        'intersection_union_mapper' is a hybrid method that combines the strengths of both 'intersection'
+        and 'union': it combines the intersection and the union of the embeddings via a third UMAP, as
+        described in the README's Usage section. This method is useful when both numerical and categorical
+        data are important, but one type of data is not necessarily more important than the other.
+        See: https://umap-learn.readthedocs.io/en/latest/composing_models.html
 
     prediction_data: bool, default=False
         Whether to generate extra cached data for predicting labels or
@@ -105,7 +122,7 @@ class DenseClus(BaseEstimator, ClassifierMixin):
     def __init__(
         self,
         random_state: int = 42,
-        umap_combine_method: str = "intersection",
+        umap_combine_method: str = "contrast",
         prediction_data: bool = False,
         verbose: bool = False,
         umap_params=None,
diff --git a/notebooks/DenseClusExampleNB.ipynb b/notebooks/01_DenseClusExampleNB.ipynb
similarity index 100%
rename from notebooks/DenseClusExampleNB.ipynb
rename to notebooks/01_DenseClusExampleNB.ipynb
diff --git a/notebooks/TuningwithHDBSCAN.ipynb b/notebooks/02_TuningWithHDBSCAN.ipynb
similarity index 100%
rename from notebooks/TuningwithHDBSCAN.ipynb
rename to notebooks/02_TuningWithHDBSCAN.ipynb
diff --git a/notebooks/ValidationForUMAP.ipynb b/notebooks/03_ValidationForUMAP.ipynb
similarity index 100%
rename from notebooks/ValidationForUMAP.ipynb
rename to notebooks/03_ValidationForUMAP.ipynb
diff --git a/pyproject.toml b/pyproject.toml
index b2aabe4..78126ee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ target-version = "py311"
 fix = true
 unfixable = []
 select = ["E", "W"]
-ignore = ["E203", "E231", "E402", "E712", "F401"]
+ignore = ["E203", "E231", "E402", "E712", "F401", "E501"]
 exclude = [
     '.git',
     '__pycache__',
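
As a quick illustration of the four `umap_combine_method` options documented in the docstring above, a minimal sketch (assuming only the `DenseClus`, `make_dataframe`, `fit`, and `score` calls already shown in the README; the snippet itself is not part of the patch) might look like:

```python
from denseclus import DenseClus
from denseclus.utils import make_dataframe

df = make_dataframe()  # synthetic mixed-type demo data, as in the README example

# Fit one model per combine method; 'contrast' is the new default set by this patch.
for method in ("intersection", "union", "contrast", "intersection_union_mapper"):
    clf = DenseClus(umap_combine_method=method, random_state=42)
    clf.fit(df)
    print(method, clf.score())  # same score() call as the README quick-start
```

Note that `intersection_union_mapper` combines the embeddings via a third UMAP and is therefore the slowest of the four, as the README's Usage section points out.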