From 8c740e9db24d7aa6f936068cd021de912ac08fc6 Mon Sep 17 00:00:00 2001
From: Charles Frenzel
Date: Sun, 12 Nov 2023 10:28:11 -0500
Subject: [PATCH] black readme and update docstring

---
 README.md                                     | 37 +++++++++++++++----
 denseclus/DenseClus.py                        | 33 +++++++++++++----
 ...leNB.ipynb => 01_DenseClusExampleNB.ipynb} |  0
 ...BSCAN.ipynb => 02_TuningWithHDBSCAN.ipynb} |  0
 ...rUMAP.ipynb => 03_ValidationForUMAP.ipynb} |  0
 pyproject.toml                                |  2 +-
 6 files changed, 55 insertions(+), 17 deletions(-)
 rename notebooks/{DenseClusExampleNB.ipynb => 01_DenseClusExampleNB.ipynb} (100%)
 rename notebooks/{TuningwithHDBSCAN.ipynb => 02_TuningWithHDBSCAN.ipynb} (100%)
 rename notebooks/{ValidationForUMAP.ipynb => 03_ValidationForUMAP.ipynb} (100%)

diff --git a/README.md b/README.md
index cd24488..6d0e82a 100644
--- a/README.md
+++ b/README.md
@@ -43,8 +43,7 @@ print(clf.score())
 
 ## Usage
 
-For slower but more stable results select `intersection_union_mapper` to combine embedding layers via third UMAP.
-Be sure that random seeds are set too!
+For slower but more **stable** results, select `intersection_union_mapper` to combine embedding layers via a third UMAP, which gives equal weight to both numeric and categorical columns. By default, the random seed is set; this prevents UMAP from running in parallel but helps circumvent some of [the randomness](https://umap-learn.readthedocs.io/en/latest/reproducibility.html) of the algorithm.
 
 ```python
 clf = DenseClus(
@@ -52,6 +51,8 @@ clf = DenseClus(
 )
 ```
 
+### Advanced Usage
+
 For advanced users, it's possible to select more fine-grained control of the underlying algorithms by passing dictionaries into `DenseClus`
 class.
 
@@ -60,15 +61,18 @@ For example:
 from denseclus import DenseClus
 from denseclus.utils import make_dataframe
 
-umap_params = {'categorical': {'n_neighbors': 15, 'min_dist': 0.1},
-               'numerical': {'n_neighbors': 20, 'min_dist': 0.1}}
-hdbscan_params = {'min_cluster_size': 10}
+umap_params = {
+    "categorical": {"n_neighbors": 15, "min_dist": 0.1},
+    "numerical": {"n_neighbors": 20, "min_dist": 0.1},
+}
+hdbscan_params = {"min_cluster_size": 10}
 
 df = make_dataframe()
 clf = DenseClus(umap_combine_method="union"
-                ,umap_params=umap_params
-                ,hdbscan_params=hdbscan_params)
+                , umap_params=umap_params
+                , hdbscan_params=hdbscan_params
+                , random_state=None) # this will run in parallel
 clf.fit(df)
 ```
 
@@ -76,7 +80,24 @@ clf.fit(df)
 
 ## Examples
 
-A hands-on example with an overview of how to use is currently available in the form of a [Jupyter Notebook](/notebooks/DenseClus%20Example%20NB.ipynb).
+### Notebooks
+
+A hands-on example with an overview of how to use DenseClus is currently available in the form of an [Example Jupyter Notebook](/notebooks/01_DenseClusExampleNB.ipynb).
+
+Should you need to tune HDBSCAN, here is an optional approach: [Tuning with HDBSCAN Notebook](/notebooks/02_TuningWithHDBSCAN.ipynb)
+
+Should you need to validate UMAP embeddings, there is an approach to do so in the [Validation for UMAP Notebook](/notebooks/03_ValidationForUMAP.ipynb)
+
+### Blogs
+
+
+[AWS Blog: Introducing DenseClus, an open source clustering package for mixed-type data](https://aws.amazon.com/blogs/opensource/introducing-denseclus-an-open-source-clustering-package-for-mixed-type-data/)
+
+[TDS Blog: How To Tune HDBSCAN](https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970)
+
+[TDS Blog: On the Validation of UMAP](https://towardsdatascience.com/on-the-validating-umap-embeddings-2c8907588175)
+
+
 ## References
 
diff --git a/denseclus/DenseClus.py b/denseclus/DenseClus.py
index bcb0c42..d554381 100644
--- a/denseclus/DenseClus.py
+++ b/denseclus/DenseClus.py
@@ -52,21 +52,38 @@ class DenseClus(BaseEstimator, ClassifierMixin):
     Parameters
     ----------
-    random_state : int, default=None
+    random_state : int, default=42
         Random State for both UMAP and numpy.random.
         If set to None UMAP will run in Numba in multicore mode but
         results may vary between runs.
         Setting a seed may help to offset the stochastic nature of
        UMAP by setting it with fixed random seed.
 
-    umap_combine_method : str, default=intersection
+    umap_combine_method : str, default=contrast
         Method by which to combine embeddings spaces.
         Options include: intersection, union,
         contrast,
-        intersection_union_mapper
-        The latter combines both the intersection and union of
-        the embeddings.
-        See:
-        https://umap-learn.readthedocs.io/en/latest/composing_models.html
+        and intersection_union_mapper. Each method is
+        described below.
+
+        'intersection' preserves the numerical embeddings more, focusing on the quantitative aspects of
+        the data. This method is particularly useful when the numerical data is of higher importance or
+        relevance to the clustering task.
+
+        'union' preserves the categorical embeddings more, emphasizing the qualitative aspects of the
+        data. This method is ideal when the categorical data carries significant weight or importance in
+        the clustering task.
+
+        'contrast' highlights the differences between the numerical and categorical embeddings, providing
+        a more balanced representation of both. This method is particularly useful when there are
+        significant differences between the numerical and categorical data and both types of data are
+        equally important for the clustering task.
+
+        'intersection_union_mapper' is a hybrid method that combines the strengths of both 'intersection'
+        and 'union': it combines the intersection and the union of the embeddings via a third UMAP, as
+        described in the README's Usage section. This method is useful when both numerical and categorical
+        data are important, but one type of data is not necessarily more important than the other.
+        See: https://umap-learn.readthedocs.io/en/latest/composing_models.html
 
     prediction_data: bool, default=False
         Whether to generate extra cached data for predicting labels or
@@ -105,7 +122,7 @@ class DenseClus(BaseEstimator, ClassifierMixin):
     def __init__(
         self,
         random_state: int = 42,
-        umap_combine_method: str = "intersection",
+        umap_combine_method: str = "contrast",
         prediction_data: bool = False,
         verbose: bool = False,
         umap_params=None,
diff --git a/notebooks/DenseClusExampleNB.ipynb b/notebooks/01_DenseClusExampleNB.ipynb
similarity index 100%
rename from notebooks/DenseClusExampleNB.ipynb
rename to notebooks/01_DenseClusExampleNB.ipynb
diff --git a/notebooks/TuningwithHDBSCAN.ipynb b/notebooks/02_TuningWithHDBSCAN.ipynb
similarity index 100%
rename from notebooks/TuningwithHDBSCAN.ipynb
rename to notebooks/02_TuningWithHDBSCAN.ipynb
diff --git a/notebooks/ValidationForUMAP.ipynb b/notebooks/03_ValidationForUMAP.ipynb
similarity index 100%
rename from notebooks/ValidationForUMAP.ipynb
rename to notebooks/03_ValidationForUMAP.ipynb
diff --git a/pyproject.toml b/pyproject.toml
index b2aabe4..78126ee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ target-version = "py311"
 fix = true
 unfixable = []
 select = ["E", "W"]
-ignore = ["E203", "E231", "E402", "E712", "F401"]
+ignore = ["E203", "E231", "E402", "E712", "F401", "E501"]
 exclude = [
     '.git',
     '__pycache__',
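
As a quick illustration of the four `umap_combine_method` options documented in the docstring above, a minimal sketch (assuming only the `DenseClus`, `make_dataframe`, `fit`, and `score` calls already shown in the README; the snippet itself is not part of the patch) might look like:

```python
from denseclus import DenseClus
from denseclus.utils import make_dataframe

df = make_dataframe()  # synthetic mixed-type demo data, as in the README example

# Fit one model per combine method; 'contrast' is the new default set by this patch.
for method in ("intersection", "union", "contrast", "intersection_union_mapper"):
    clf = DenseClus(umap_combine_method=method, random_state=42)
    clf.fit(df)
    print(method, clf.score())  # same score() call as the README quick-start
```

Note that `intersection_union_mapper` combines the embeddings via a third UMAP and is therefore the slowest of the four, as the README's Usage section points out.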