black readme and update docstring

awslabs · Nov 12, 2023 · 8c740e9 · 8c740e9
1 parent 5f4b638
commit 8c740e9
Show file tree

Hide file tree

Showing 6 changed files with 55 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -43,15 +43,16 @@ print(clf.score())
 
 ## Usage
 
-For slower but more stable results select `intersection_union_mapper` to combine embedding layers via third UMAP.
-Be sure that random seeds are set too!
+For slower but more **stable** results, select `intersection_union_mapper` to combine embedding layers via a third UMAP, which will provide equal weight to both numerical and categorical columns. By default, the random seed is set, which eliminates the ability for UMAP to run in parallel but will help circumvent some of [the randomness](https://umap-learn.readthedocs.io/en/latest/reproducibility.html) of the algorithm.
 
 ```python
 clf = DenseClus(
     umap_combine_method="intersection_union_mapper",
 )
 ```
 
+### Advanced Usage
+
 For advanced users, it's possible to select more fine-grained control of the underlying algorithms by passing
 dictionaries into `DenseClus` class.
 
@@ -60,23 +61,43 @@ For example:
 from denseclus import DenseClus
 from denseclus.utils import make_dataframe
 
-umap_params = {'categorical': {'n_neighbors': 15, 'min_dist': 0.1},
-              'numerical': {'n_neighbors': 20, 'min_dist': 0.1}}
-hdbscan_params = {'min_cluster_size': 10}
+umap_params = {
+    "categorical": {"n_neighbors": 15, "min_dist": 0.1},
+    "numerical": {"n_neighbors": 20, "min_dist": 0.1},
+}
+hdbscan_params = {"min_cluster_size": 10}
 
 df = make_dataframe()
 
 clf = DenseClus(umap_combine_method="union"
-              ,umap_params=umap_params
-              ,hdbscan_params=hdbscan_params)
+             , umap_params=umap_params
+             , hdbscan_params=hdbscan_params
+             , random_state=None) # this will run in parallel
 
 clf.fit(df)
 ```
 
 
 ## Examples
 
-A hands-on example with an overview of how to use is currently available in the form of a [Jupyter Notebook](/notebooks/DenseClus%20Example%20NB.ipynb).
+### Notebooks
+
+A hands-on example with an overview of how to use it is currently available in the form of an [Example Jupyter Notebook](/notebooks/01_DenseClusExampleNB.ipynb).
+
+Should you need to tune HDBSCAN, here is an optional approach: [Tuning with HDBSCAN Notebook](/notebooks/02_TuningWithHDBSCAN.ipynb)
+
+Should you need to validate UMAP embeddings, there is an approach to do so in the [Validation for UMAP Notebook](/notebooks/03_ValidationForUMAP.ipynb)
+
+### Blogs
+
+
+[AWS Blog: Introducing DenseClus, an open source clustering package for mixed-type data](https://aws.amazon.com/blogs/opensource/introducing-denseclus-an-open-source-clustering-package-for-mixed-type-data/)
+
+[TDS Blog: How To Tune HDBSCAN](https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970)
+
+[TDS Blog: On the Validation of UMAP](https://towardsdatascience.com/on-the-validating-umap-embeddings-2c8907588175)
+
+
 
 ## References
 

diff --git a/denseclus/DenseClus.py b/denseclus/DenseClus.py
@@ -52,21 +52,38 @@ class DenseClus(BaseEstimator, ClassifierMixin):
 
     Parameters
     ----------
-            random_state : int, default=None
+            random_state : int, default=42
                 Random State for both UMAP and numpy.random.
                 If set to None UMAP will run in Numba in multicore mode but
                 results may vary between runs.
                 Setting a seed may help to offset the stochastic nature of
                 UMAP by setting it with fixed random seed.
 
-            umap_combine_method : str, default=intersection
+            umap_combine_method : str, default=contrast
                 Method by which to combine embeddings spaces.
                 Options include: intersection, union, contrast,
-                intersection_union_mapper
-                The latter combines both the intersection and union of
-                the embeddings.
-                See:
-                https://umap-learn.readthedocs.io/en/latest/composing_models.html
+                and intersection_union_mapper. Each method combines the
+                embeddings differently:
+
+                'intersection' preserves the numerical embeddings more, focusing on the quantitative aspects of
+                the data. This method is particularly useful when the numerical data is of higher importance or
+                relevance to the clustering task.
+
+                'union' preserves the categorical embeddings more, emphasizing the qualitative aspects of the
+                data. This method is ideal when the categorical data carries significant weight or importance in
+                the clustering task.
+
+                'contrast' highlights the differences between the numerical and categorical embeddings, providing
+                a more balanced representation of both. This method is particularly useful when there are
+                significant differences between the numerical and categorical data, and both types of data are
+                equally important for the clustering task.
+
+                'intersection_union_mapper' is a hybrid method that combines the strengths of both 'intersection'
+                and 'union'. It first applies the 'intersection' method to preserve the numerical embeddings, then
+                applies the 'union' method to preserve the categorical embeddings. This method is useful when both
+                numerical and categorical data are important, but one type of data is not necessarily more
+                important than the other.
+                See: https://umap-learn.readthedocs.io/en/latest/composing_models.html
 
             prediction_data: bool, default=False
                 Whether to generate extra cached data for predicting labels or
@@ -105,7 +122,7 @@ class DenseClus(BaseEstimator, ClassifierMixin):
     def __init__(
         self,
         random_state: int = 42,
-        umap_combine_method: str = "intersection",
+        umap_combine_method: str = "contrast",
         prediction_data: bool = False,
         verbose: bool = False,
         umap_params=None,

diff --git a/notebooks/DenseClusExampleNB.ipynb → notebooks/01_DenseClusExampleNB.ipynb b/notebooks/DenseClusExampleNB.ipynb → notebooks/01_DenseClusExampleNB.ipynb
diff --git a/notebooks/TuningwithHDBSCAN.ipynb → notebooks/02_TuningWithHDBSCAN.ipynb b/notebooks/TuningwithHDBSCAN.ipynb → notebooks/02_TuningWithHDBSCAN.ipynb
diff --git a/notebooks/ValidationForUMAP.ipynb → notebooks/03_ValidationForUMAP.ipynb b/notebooks/ValidationForUMAP.ipynb → notebooks/03_ValidationForUMAP.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -41,7 +41,7 @@ target-version = "py311"
 fix = true
 unfixable = []
 select = ["E", "W"]
-ignore = ["E203", "E231", "E402", "E712", "F401"]
+ignore = ["E203", "E231", "E402", "E712", "F401","E501"]
 exclude = [
     '.git',
     '__pycache__',