From 211577d166d37c5ea736b535deecd10c0952ce24 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 16 Apr 2021 11:31:20 +0100 Subject: [PATCH 1/9] Make fit_step api public and other minor improvements --- wellcomeml/ml/clustering.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index 6c6b36c5..fa4245ca 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -113,6 +113,8 @@ class that is a sklearn.base.ClusterMixin 'random_state'): self.clustering_class.random_state = clustering_random_state + self.embedded_points = None + self.reduced_points = None self.cluster_ids = None self.cluster_names = None self.cluster_kws = None @@ -131,22 +133,26 @@ def fit(self, X, *_): A TextClustering object """ - self._fit_step(X, step='vectorizer') - self._fit_step(step='reducer') - self._fit_step(step='clustering') + self.fit_step(X, step='vectorizer') + self.fit_step(step='reducer') + self.fit_step(step='clustering') if self.embedding == 'tf-idf' and self.n_kw: self._find_keywords(self.embedded_points.toarray(), n_kw=self.n_kw) return self - def _fit_step(self, X=None, step='vectorizer'): + def fit_step(self, X=None, y=None, step='vectorizer'): """Internal function for partial fitting only a certain step""" if step == 'vectorizer': self.embedded_points = self.vectorizer.fit_transform(X) elif step == 'reducer': + if self.embedded_points is None: + raise ValueError( + 'You must embed/vectorise the points before reducing dimensionality' + ) self.reduced_points = \ - self.reducer_class.fit_transform(self.embedded_points) + self.reducer_class.fit_transform(self.embedded_points, y=y) elif step == 'clustering': points = ( self.reduced_points if self.cluster_reduced else @@ -239,6 +245,7 @@ def optimise(self, X, param_grid, n_cluster_range=None, max_noise=0.2, param_grid.get('clustering', {}).items()} } + grid = GridSearchCV( estimator=pipeline, param_grid=params, @@ -260,7 +267,9 @@ def optimise(self, X, param_grid, n_cluster_range=None, max_noise=0.2, # Prunes result to actually optimise under constraints best_silhouette = 0 best_params = {} + grid.fit(X, y=None) + for params, silhouette, noise, n_clusters in zip( grid.cv_results_['params'], grid.cv_results_['mean_test_silhouette'], From 77f3cb99aa24c31c96f1995ae78cd04ba0f78724 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 16 Apr 2021 12:07:02 +0100 Subject: [PATCH 2/9] Save and load --- wellcomeml/ml/clustering.py | 40 ++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index fa4245ca..adb21aa9 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -1,6 +1,7 @@ from collections import defaultdict import logging import os +import pickle from wellcomeml.ml import vectorizer from wellcomeml.logger import logger @@ -245,7 +246,7 @@ def optimise(self, X, param_grid, n_cluster_range=None, max_noise=0.2, param_grid.get('clustering', {}).items()} } - + grid = GridSearchCV( estimator=pipeline, param_grid=params, @@ -301,6 +302,43 @@ def optimise(self, X, param_grid, n_cluster_range=None, max_noise=0.2, return best_params + def save(self, folder, embedded_points=True, reduced_points=True, clustering_model=True, + create_folder=True): + """ + Saves the different steps of the pipeline + + Args: + folder(str): path to folder + embedded_points(bool): whether to save embedded/vectorized points as a .npy file + reduced_points(bool): whether to save reduced (2D) points as a .npy file + clustering_model(bool): whether to save the clustering model as a .pkl file + create_folder(bool): whether to creat the folder in case it doesn't exits + + Returns: + + """ + if create_folder: + os.makedirs(folder, exist_ok=True) + + if embedded_points: + np.save(os.path.join(folder, 'embedded_points.npy'), self.embedded_points) + if reduced_points: + np.save(os.path.join(folder, 'reduced_points.npy'), self.reduced_points) + if clustering_model: + with open(os.path.join(folder, 'clustering.pkl'), 'wb') as f: + pickle.dump(self.clustering_class, f) + + def load(self, folder, embedded_points=True, reduced_points=True, clustering_model=True): + if embedded_points: + self.embedded_points = np.load(os.path.join(folder, 'embedded_points.npy'), + allow_pickle=True) + if reduced_points: + self.reduced_points = np.load(os.path.join(folder, 'reduced_points.npy'), + allow_pickle=True) + if clustering_model: + with open(os.path.join(folder, 'clustering.pickle'), 'rb') as f: + self.clustering_class = pickle.load(f) + def stability(self): """Function to calculate how stable the clusters are""" raise NotImplementedError From 022e9a3b8ea9f622ec4418b0684a2aa2341a02a1 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 16 Apr 2021 16:29:10 +0100 Subject: [PATCH 3/9] Add custom save and load (note that you can just pickle the class, but that save and load helps you deal with partial pipelines) --- wellcomeml/ml/clustering.py | 72 +++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index adb21aa9..0bcd2fcc 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -40,6 +40,13 @@ class TextClustering(object): cluster_names: Names of the clusters cluster_kws: Keywords for the clusters (only if embedding=tf-idf) """ + # A class parameter to list all possible components + components = ['embbedded_points', + 'reduced_points', + 'vectorizer', + 'reducer', + 'clustering_model'] + def __init__(self, embedding='tf-idf', reducer='umap', clustering='dbscan', cluster_reduced=True, n_kw=10, params={}, embedding_random_state=None, reducer_random_state=None, @@ -152,8 +159,10 @@ def fit_step(self, X=None, y=None, step='vectorizer'): raise ValueError( 'You must embed/vectorise the points before reducing dimensionality' ) - self.reduced_points = \ - self.reducer_class.fit_transform(self.embedded_points, y=y) + if X is None: + X = self.embedded_points + + self.reduced_points = self.reducer_class.fit_transform(X=X, y=y) elif step == 'clustering': points = ( self.reduced_points if self.cluster_reduced else @@ -302,43 +311,70 @@ def optimise(self, X, param_grid, n_cluster_range=None, max_noise=0.2, return best_params - def save(self, folder, embedded_points=True, reduced_points=True, clustering_model=True, - create_folder=True): + def save(self, folder, components='all', create_folder=True): """ Saves the different steps of the pipeline Args: folder(str): path to folder - embedded_points(bool): whether to save embedded/vectorized points as a .npy file - reduced_points(bool): whether to save reduced (2D) points as a .npy file - clustering_model(bool): whether to save the clustering model as a .pkl file - create_folder(bool): whether to creat the folder in case it doesn't exits - - Returns: + components(list or 'all'): List of components to save. Options are: 'embbedded_points', + 'reduced_points', 'vectorizer', 'reducer', and 'clustering_model'. By default, loads 'all' + (you can get all components by listing the class param TextClustering.components) """ if create_folder: os.makedirs(folder, exist_ok=True) - if embedded_points: + if components == 'all' or 'embedded_points' in components: np.save(os.path.join(folder, 'embedded_points.npy'), self.embedded_points) - if reduced_points: + + if components == 'all' or 'reduced_points' in components: np.save(os.path.join(folder, 'reduced_points.npy'), self.reduced_points) - if clustering_model: + + if components == 'all' or 'vectorizer' in components: + with open(os.path.join(folder, 'vectorizer.pkl'), 'wb') as f: + pickle.dump(self.vectorizer, f) + + if components == 'all' or 'reducer' in components: + with open(os.path.join(folder, 'reducer.pkl'), 'wb') as f: + pickle.dump(self.reducer_class, f) + + if components == 'all' or 'clustering_model' in components: with open(os.path.join(folder, 'clustering.pkl'), 'wb') as f: pickle.dump(self.clustering_class, f) - def load(self, folder, embedded_points=True, reduced_points=True, clustering_model=True): - if embedded_points: + def load(self, folder, components='all'): + """ + Loads different stes of the pipeline + Args: + folder: + components: + + Returns: + + """ + + if components == 'all' or 'embedded_points' in components: self.embedded_points = np.load(os.path.join(folder, 'embedded_points.npy'), allow_pickle=True) - if reduced_points: + + if components == 'all' or 'reduced_points' in components: self.reduced_points = np.load(os.path.join(folder, 'reduced_points.npy'), allow_pickle=True) - if clustering_model: - with open(os.path.join(folder, 'clustering.pickle'), 'rb') as f: + + if components == 'all' or 'vectorizer' in components: + with open(os.path.join(folder, 'vectorizer.pkl'), 'rb') as f: + self.vectorizer = pickle.load(f) + + if components == 'all' or 'reducer' in components: + with open(os.path.join(folder, 'reducer.pkl'), 'rb') as f: + self.reducer_class = pickle.load(f) + + if components == 'all' or 'clustering_model' in components: + with open(os.path.join(folder, 'clustering.pkl'), 'rb') as f: self.clustering_class = pickle.load(f) + def stability(self): """Function to calculate how stable the clusters are""" raise NotImplementedError From d64504e4ef25e008d32f778f3c8bce32f718a320 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 16 Apr 2021 17:08:43 +0100 Subject: [PATCH 4/9] Custom load for sparse arrays --- wellcomeml/ml/clustering.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index 0bcd2fcc..183d7390 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -357,6 +357,8 @@ def load(self, folder, components='all'): if components == 'all' or 'embedded_points' in components: self.embedded_points = np.load(os.path.join(folder, 'embedded_points.npy'), allow_pickle=True) + if not self.embedded_points.shape: + self.embedded_points = self.embedded_points[()] if components == 'all' or 'reduced_points' in components: self.reduced_points = np.load(os.path.join(folder, 'reduced_points.npy'), @@ -374,7 +376,6 @@ def load(self, folder, components='all'): with open(os.path.join(folder, 'clustering.pkl'), 'rb') as f: self.clustering_class = pickle.load(f) - def stability(self): """Function to calculate how stable the clusters are""" raise NotImplementedError From 52ead922534339f721364d23acb6e3f754c9bf0c Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 22 Apr 2021 08:01:22 +0100 Subject: [PATCH 5/9] Improve docs --- wellcomeml/ml/clustering.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index 183d7390..f1bc45c6 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -40,12 +40,6 @@ class TextClustering(object): cluster_names: Names of the clusters cluster_kws: Keywords for the clusters (only if embedding=tf-idf) """ - # A class parameter to list all possible components - components = ['embbedded_points', - 'reduced_points', - 'vectorizer', - 'reducer', - 'clustering_model'] def __init__(self, embedding='tf-idf', reducer='umap', clustering='dbscan', cluster_reduced=True, n_kw=10, params={}, @@ -345,12 +339,13 @@ def save(self, folder, components='all', create_folder=True): def load(self, folder, components='all'): """ - Loads different stes of the pipeline - Args: - folder: - components: + Loads the different steps of the pipeline - Returns: + Args: + folder(str): path to folder + components(list or 'all'): List of components to load. Options are: 'embbedded_points', + 'reduced_points', 'vectorizer', 'reducer', and 'clustering_model'. By default, loads 'all' + (you can get all components by listing the class param TextClustering.components) """ From 02bf6024302a63f1f6725e524a60ee083a1cd0ee Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 22 Apr 2021 08:04:47 +0100 Subject: [PATCH 6/9] Flake8 --- wellcomeml/ml/clustering.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index f1bc45c6..0ba45f32 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -249,7 +249,6 @@ def optimise(self, X, param_grid, n_cluster_range=None, max_noise=0.2, param_grid.get('clustering', {}).items()} } - grid = GridSearchCV( estimator=pipeline, param_grid=params, @@ -312,8 +311,9 @@ def save(self, folder, components='all', create_folder=True): Args: folder(str): path to folder components(list or 'all'): List of components to save. Options are: 'embbedded_points', - 'reduced_points', 'vectorizer', 'reducer', and 'clustering_model'. By default, loads 'all' - (you can get all components by listing the class param TextClustering.components) + 'reduced_points', 'vectorizer', 'reducer', and 'clustering_model'. By default, loads + 'all' (you can get all components by listing the class param + TextClustering.components) """ if create_folder: @@ -344,8 +344,9 @@ def load(self, folder, components='all'): Args: folder(str): path to folder components(list or 'all'): List of components to load. Options are: 'embbedded_points', - 'reduced_points', 'vectorizer', 'reducer', and 'clustering_model'. By default, loads 'all' - (you can get all components by listing the class param TextClustering.components) + 'reduced_points', 'vectorizer', 'reducer', and 'clustering_model'. By default, loads + 'all' (you can get all components by listing the class param + TextClustering.components) """ From 6fa881fb745033099cc65d271ad697d5e58ee4b1 Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 22 Apr 2021 09:04:49 +0100 Subject: [PATCH 7/9] Add tests --- tests/test_clustering.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 3ce10106..c47221c2 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize("reducer,cluster_reduced", [("tsne", True), ("umap", True), ("umap", False)]) -def test_full_pipeline(reducer, cluster_reduced): +def test_full_pipeline(reducer, cluster_reduced, tmp_path): cluster = TextClustering(reducer=reducer, cluster_reduced=cluster_reduced, embedding_random_state=42, reducer_random_state=43, @@ -23,6 +23,15 @@ def test_full_pipeline(reducer, cluster_reduced): assert len(cluster.cluster_kws) == len(cluster.cluster_ids) == 6 + cluster.save(folder=tmp_path) + + cluster_new = TextClustering() + cluster_new.load(folder=tmp_path) + + # Asserts all coordinates of the loaded points are equal + assert (cluster_new.embedded_points != cluster.embedded_points).sum() == 0 + assert (cluster_new.reduced_points != cluster.reduced_points).sum() == 0 + @pytest.mark.parametrize("reducer", ["tsne", "umap"]) def test_parameter_search(reducer): From 3ce111bcadee76647d0323efdb514f9dce9ef4c3 Mon Sep 17 00:00:00 2001 From: aCampello Date: Mon, 26 Apr 2021 11:57:22 +0100 Subject: [PATCH 8/9] Reduce code duplication --- wellcomeml/ml/clustering.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/wellcomeml/ml/clustering.py b/wellcomeml/ml/clustering.py index 0ba45f32..1943c7f0 100644 --- a/wellcomeml/ml/clustering.py +++ b/wellcomeml/ml/clustering.py @@ -124,6 +124,12 @@ class that is a sklearn.base.ClusterMixin self.silhouette = None self.optimise_results = {} + self.embedded_points_filename = 'embedded_points.npy' + self.reduced_points_filename = 'reduced_points.npy' + self.vectorizer_filename = 'vectorizer.pkl' + self.reducer_filename = 'reducer.pkl' + self.clustering_filename = 'clustering.pkl' + def fit(self, X, *_): """ Fits all clusters in the pipeline @@ -320,21 +326,21 @@ def save(self, folder, components='all', create_folder=True): os.makedirs(folder, exist_ok=True) if components == 'all' or 'embedded_points' in components: - np.save(os.path.join(folder, 'embedded_points.npy'), self.embedded_points) + np.save(os.path.join(folder, self.embedded_points_filename), self.embedded_points) if components == 'all' or 'reduced_points' in components: - np.save(os.path.join(folder, 'reduced_points.npy'), self.reduced_points) + np.save(os.path.join(folder, self.reduced_points_filename), self.reduced_points) if components == 'all' or 'vectorizer' in components: - with open(os.path.join(folder, 'vectorizer.pkl'), 'wb') as f: + with open(os.path.join(folder, self.vectorizer_filename), 'wb') as f: pickle.dump(self.vectorizer, f) if components == 'all' or 'reducer' in components: - with open(os.path.join(folder, 'reducer.pkl'), 'wb') as f: + with open(os.path.join(folder, self.reducer_filename), 'wb') as f: pickle.dump(self.reducer_class, f) if components == 'all' or 'clustering_model' in components: - with open(os.path.join(folder, 'clustering.pkl'), 'wb') as f: + with open(os.path.join(folder, self.clustering_filename), 'wb') as f: pickle.dump(self.clustering_class, f) def load(self, folder, components='all'): @@ -351,25 +357,25 @@ def load(self, folder, components='all'): """ if components == 'all' or 'embedded_points' in components: - self.embedded_points = np.load(os.path.join(folder, 'embedded_points.npy'), + self.embedded_points = np.load(os.path.join(folder, self.embedded_points_filename), allow_pickle=True) if not self.embedded_points.shape: self.embedded_points = self.embedded_points[()] if components == 'all' or 'reduced_points' in components: - self.reduced_points = np.load(os.path.join(folder, 'reduced_points.npy'), + self.reduced_points = np.load(os.path.join(folder, self.reduced_points_filename), allow_pickle=True) if components == 'all' or 'vectorizer' in components: - with open(os.path.join(folder, 'vectorizer.pkl'), 'rb') as f: + with open(os.path.join(folder, self.vectorizer_filename), 'rb') as f: self.vectorizer = pickle.load(f) if components == 'all' or 'reducer' in components: - with open(os.path.join(folder, 'reducer.pkl'), 'rb') as f: + with open(os.path.join(folder, self.reducer_filename), 'rb') as f: self.reducer_class = pickle.load(f) if components == 'all' or 'clustering_model' in components: - with open(os.path.join(folder, 'clustering.pkl'), 'rb') as f: + with open(os.path.join(folder, self.clustering_filename), 'rb') as f: self.clustering_class = pickle.load(f) def stability(self): From 4003a7083670d2b7754fdb1868bc62c700ab561a Mon Sep 17 00:00:00 2001 From: aCampello Date: Mon, 26 Apr 2021 11:57:34 +0100 Subject: [PATCH 9/9] Add new test that it re-loads the correct class --- tests/test_clustering.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index c47221c2..c8ea040e 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -31,6 +31,8 @@ def test_full_pipeline(reducer, cluster_reduced, tmp_path): # Asserts all coordinates of the loaded points are equal assert (cluster_new.embedded_points != cluster.embedded_points).sum() == 0 assert (cluster_new.reduced_points != cluster.reduced_points).sum() == 0 + assert cluster_new.reducer_class.__class__ == cluster.reducer_class.__class__ + assert cluster_new.clustering_class.__class__ == cluster.clustering_class.__class__ @pytest.mark.parametrize("reducer", ["tsne", "umap"])