Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Archipelago - dict transformer for vectorizing persistence diagrams #1017

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
de408e9
add archipelago class
Nov 28, 2023
927c789
give default quantiser sklearn.KMeans to Atol method
Dec 19, 2023
3e95419
homology_dimensions and settlers for Archipelago class
Dec 19, 2023
3bc232a
n_init parameter for sklearn KMeans
Dec 20, 2023
431a692
archipelago island
Dec 20, 2023
aaef59f
mature version compatible with gudhi.representations.vector_methods
Jan 4, 2024
adc8668
Atol try/catch
Jan 4, 2024
4685748
fix docstrings
Jan 4, 2024
2cf60e4
typo
Jan 4, 2024
6524320
Merge branch 'GUDHI:master' into archipelago
martinroyer Jan 4, 2024
5561d09
docstring correct
Jan 4, 2024
5367ea1
refactor removing input preprocessing, instead we take raw dgm format…
Jan 4, 2024
95bd156
prints
Jan 4, 2024
b4de687
Revert try/catch optimizer fit in Atol
martinroyer Jan 5, 2024
89be488
fix set_output from sklearn so as to return pandas without importing …
Jan 5, 2024
f5dc92d
default KMeans parameter
Jan 5, 2024
6bfb164
change confusing Atol __call__ function
Jan 5, 2024
23a5e47
define get_feature_names_out for Atol
Jan 5, 2024
a59af1b
test fixes
Jan 5, 2024
9fadd61
hopefully fix atol test following `n_init="auto"` in KMeans
Jan 5, 2024
0b1c6b9
revert value changes to doc
Jan 8, 2024
aa0d3cb
updated docstring
Jan 8, 2024
4afb0ef
tentative change n_init value for test compatibility 3.7
Jan 12, 2024
2df14d9
remove try except
Jan 12, 2024
f01ac19
call vectorizer get_feature_names_out if exists
Jan 12, 2024
c43b190
more sklearn logic
Jan 12, 2024
b74dcae
atol fixes:
Jun 7, 2024
26eef83
add test for representations interface fit/transform/...
Jun 7, 2024
5d8af99
remove archipelago
Jun 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 39 additions & 11 deletions src/python/gudhi/representations/vector_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import pairwise
from sklearn.cluster import KMeans

try:
# New location since 1.0
from sklearn.metrics import DistanceMetric
Expand Down Expand Up @@ -719,21 +721,26 @@ class Atol(BaseEstimator, TransformerMixin):
>>> a = np.array([[1, 2, 4], [1, 4, 0], [1, 0, 4]])
>>> b = np.array([[4, 2, 0], [4, 4, 0], [4, 0, 2]])
>>> c = np.array([[3, 2, -1], [1, 2, -1]])
>>> atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006))
>>> atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006, n_init=10))
Comment on lines -722 to +724
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@VincentRouvreau I believe this is what you need part1

>>> atol_vectoriser.fit(X=[a, b, c]).centers
array([[ 2.6 , 2.8 , -0.4 ],
[ 2. , 0.66666667, 3.33333333]])
>>> atol_vectoriser(a)
>>> atol_vectoriser._transform(a)
array([0.42375966, 1.18168665])
>>> atol_vectoriser(c)
>>> atol_vectoriser._transform(c)
array([1.25157463, 0.02062512])
>>> atol_vectoriser.transform(X=[a, b, c])
array([[0.42375966, 1.18168665],
[1.06330156, 0.29861028],
[1.25157463, 0.02062512]])
"""
# Note the example above must be up to date with the one in tests called test_atol_doc
def __init__(self, quantiser, weighting_method="cloud", contrast="gaussian"):
def __init__(
self,
quantiser=KMeans(n_clusters=2, random_state=202312, n_init=10),
weighting_method="cloud",
contrast="gaussian"
):
"""
Constructor for the Atol measure vectorisation class.

Expand All @@ -751,6 +758,7 @@ def __init__(self, quantiser, weighting_method="cloud", contrast="gaussian"):
self.quantiser = quantiser
self.contrast = contrast
self.weighting_method = weighting_method
self._running_transform_names = ""

def get_contrast(self):
return {
Expand Down Expand Up @@ -780,18 +788,34 @@ def fit(self, X, y=None, sample_weight=None):
self
"""
if not hasattr(self.quantiser, 'fit'):
raise TypeError("quantiser %s has no `fit` attribute." % (self.quantiser))

# In fitting we remove infinite death time points so that every center is finite
X = [dgm[~np.isinf(dgm).any(axis=1), :] for dgm in X]
raise TypeError(f"quantiser {self.quantiser} has no `fit` attribute.")
n_clusters = self.quantiser.n_clusters

if sample_weight is None:
sample_weight = [self.get_weighting_method()(measure) for measure in X]

measures_concat = np.concatenate(X)
weights_concat = np.concatenate(sample_weight)
self.quantiser.fit(X=measures_concat, sample_weight=weights_concat)
# In fitting we remove infinite birth/death time points so that every center is finite
filtered_measures_concat = measures_concat[~np.isinf(measures_concat).any(axis=1), :] if len(measures_concat) else measures_concat
filtered_weights_concat = weights_concat[~np.isinf(measures_concat).any(axis=1)] if len(measures_concat) else weights_concat

if not len(filtered_measures_concat) or len(filtered_measures_concat) < n_clusters:
# If no point to fit, let's arbitrarily put centers in [0, 1)
print(f" [Atol] had {len(filtered_measures_concat)} points to fit {n_clusters} clusters,"
f" adding random points in [0, 1)^2")
filtered_weights_concat = np.concatenate((
filtered_weights_concat,
np.ones(shape=(n_clusters - len(filtered_measures_concat)))
))
filtered_measures_concat = np.concatenate((
filtered_measures_concat,
np.random.random((n_clusters - len(filtered_measures_concat), 2))
))

self.quantiser.fit(X=filtered_measures_concat, sample_weight=filtered_weights_concat)
self.centers = self.quantiser.cluster_centers_

# Hack, but some people are unhappy if the order depends on the version of sklearn
self.centers = self.centers[np.lexsort(self.centers.T)]
if self.quantiser.n_clusters == 1:
Expand All @@ -805,7 +829,7 @@ def fit(self, X, y=None, sample_weight=None):
self.inertias = np.min(dist_centers, axis=0)/2
return self

def __call__(self, measure, sample_weight=None):
def _transform(self, measure, sample_weight=None):
"""
Apply measure vectorisation on a single measure. Only available after `fit` has been called.

Expand Down Expand Up @@ -834,4 +858,8 @@ def transform(self, X, sample_weight=None):
"""
if sample_weight is None:
sample_weight = [self.get_weighting_method()(measure) for measure in X]
return np.stack([self(measure, sample_weight=weight) for measure, weight in zip(X, sample_weight)])
self._running_transform_names = [f"Atol Center {i + 1}" for i in range(self.quantiser.n_clusters)]
return np.stack([self._transform(measure, sample_weight=weight) for measure, weight in zip(X, sample_weight)])

def get_feature_names_out(self):
return self._running_transform_names
16 changes: 8 additions & 8 deletions src/python/test/test_representations.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,23 +118,23 @@ def test_atol_doc():
b = np.array([[4, 2, 0], [4, 4, 0], [4, 0, 2]])
c = np.array([[3, 2, -1], [1, 2, -1]])

atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006))
atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006, n_init=10))
# Atol will do
# X = np.concatenate([a,b,c])
# kmeans = KMeans(n_clusters=2, random_state=202006).fit(X)
# kmeans = KMeans(n_clusters=2, random_state=202006, n_init=10).fit(X)
# kmeans.labels_ will be : array([1, 0, 1, 0, 0, 1, 0, 0])
first_cluster = np.asarray([a[0], a[2], b[2]])
second_cluster = np.asarray([a[1], b[0], b[2], c[0], c[1]])
second_cluster = np.asarray([a[1], b[0], b[1], c[0], c[1]])
Comment on lines -121 to +127
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@VincentRouvreau I believe this is what you need part2


# Check the center of the first_cluster and second_cluster are in Atol centers
centers = atol_vectoriser.fit(X=[a, b, c]).centers
np.isclose(centers, first_cluster.mean(axis=0)).all(1).any()
np.isclose(centers, second_cluster.mean(axis=0)).all(1).any()

vectorization = atol_vectoriser.transform(X=[a, b, c])
assert np.allclose(vectorization[0], atol_vectoriser(a))
assert np.allclose(vectorization[1], atol_vectoriser(b))
assert np.allclose(vectorization[2], atol_vectoriser(c))
assert np.allclose(vectorization[0], atol_vectoriser._transform(a))
assert np.allclose(vectorization[1], atol_vectoriser._transform(b))
assert np.allclose(vectorization[2], atol_vectoriser._transform(c))


def test_dummy_atol():
Expand All @@ -145,12 +145,12 @@ def test_dummy_atol():
for weighting_method in ["cloud", "iidproba"]:
for contrast in ["gaussian", "laplacian", "indicator"]:
atol_vectoriser = Atol(
quantiser=KMeans(n_clusters=1, random_state=202006),
quantiser=KMeans(n_clusters=1, random_state=202006, n_init=10),
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@VincentRouvreau I believe this is what you need part3/3

weighting_method=weighting_method,
contrast=contrast,
)
atol_vectoriser.fit([a, b, c])
atol_vectoriser(a)
atol_vectoriser._transform(a)
atol_vectoriser.transform(X=[a, b, c])


Expand Down
87 changes: 87 additions & 0 deletions src/python/test/test_representations_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from copy import deepcopy
import numpy as np

from sklearn.cluster import KMeans

from gudhi.representations import (Atol, Landscape, Silhouette, BettiCurve, ComplexPolynomial, \
TopologicalVector, PersistenceImage, Entropy)

# Vectorizers exercised by every test below, keyed by a short display name.
# NOTE(review): "betti" is commented out — presumably pending sklearn-interface
# compliance for BettiCurve; confirm before re-enabling.
vectorizers = {
    "atol": Atol(quantiser=KMeans(n_clusters=2, random_state=202312, n_init="auto")),
    # "betti": BettiCurve(),
}

# Toy persistence diagrams: each list entry is an (n, 2) array of
# (birth, death) pairs — one array per homology dimension, presumably.
# diag1 and diag2 contain points with infinite death time; diag1 also has an
# entirely empty dimension.
diag1 = [np.array([[0., np.inf],
                   [0., 8.94427191],
                   [0., 7.28010989],
                   [0., 6.08276253],
                   [0., 5.83095189],
                   [0., 5.38516481],
                   [0., 5.]]),
         np.array([[11., np.inf],
                   [6.32455532, 6.70820393]]),
         np.empty(shape=[0, 2])]

diag2 = [np.array([[0., np.inf],
                   [0., 8.94427191],
                   [0., 7.28010989],
                   [0., 6.08276253],
                   [0., 5.83095189],
                   [0., 5.38516481],
                   [0., 5.]]),
         np.array([[11., np.inf],
                   [6.32455532, 6.70820393]]),
         np.array([[0., np.inf],
                   [0., 1]])]

# Degenerate input: a single homology dimension with zero points.
diag3 = [np.empty(shape=[0, 2])]


def test_fit():
    """Check that every registered vectorizer can `fit` on non-empty diagrams."""
    # Plain string: the original used an f-string with no placeholders (F541).
    print(" > Testing `fit`.")
    for name, vectorizer in vectorizers.items():
        print(f" >> Testing {name}")
        # deepcopy so fitted state never leaks between tests sharing `vectorizers`.
        deepcopy(vectorizer).fit(X=[diag1[0], diag2[0]])


def test_fit_empty():
    """Check that `fit` tolerates diagrams containing no points at all."""
    # Plain string: the original used an f-string with no placeholders (F541).
    print(" > Testing `fit_empty`.")
    for name, vectorizer in vectorizers.items():
        print(f" >> Testing {name}")
        # deepcopy so fitted state never leaks between tests sharing `vectorizers`.
        deepcopy(vectorizer).fit(X=[diag3[0], diag3[0]])


def test_transform():
    """Check `fit_transform` on a mix of regular and empty diagrams."""
    # Plain string: the original used an f-string with no placeholders (F541).
    print(" > Testing `transform`.")
    for name, vectorizer in vectorizers.items():
        print(f" >> Testing {name}")
        # deepcopy so fitted state never leaks between tests sharing `vectorizers`.
        deepcopy(vectorizer).fit_transform(X=[diag1[0], diag2[0], diag3[0]])


def test_transform_empty():
    """Check that a vectorizer fitted on real data can transform empty diagrams."""
    # Plain string: the original used an f-string with no placeholders (F541).
    print(" > Testing `transform_empty`.")
    for name, vectorizer in vectorizers.items():
        print(f" >> Testing {name}")
        # Fit on non-empty input, then transform only empty diagrams.
        copy_vec = deepcopy(vectorizer).fit(X=[diag1[0], diag2[0]])
        copy_vec.transform(X=[diag3[0], diag3[0]])


def test_set_output():
    """Check `set_output(transform="pandas")` is accepted (sklearn set_output API)."""
    # Plain string: the original used an f-string with no placeholders (F541).
    print(" > Testing `set_output`.")
    # Imported without an alias (the original bound an unused `pd`, F401):
    # kept so the test fails fast with a clear error when pandas is absent,
    # since sklearn's pandas output mode needs it at transform time.
    import pandas  # noqa: F401
    for name, vectorizer in vectorizers.items():
        print(f" >> Testing {name}")
        deepcopy(vectorizer).set_output(transform="pandas")


def test_compose():
    """Check vectorizers compose with `sklearn.compose.ColumnTransformer`."""
    # Plain string: the original used an f-string with no placeholders (F541).
    print(" > Testing composition with `sklearn.compose.ColumnTransformer`.")
    from sklearn.compose import ColumnTransformer
    for name, vectorizer in vectorizers.items():
        print(f" >> Testing {name}")
        # One independent transformer per diagram entry (columns 0..2),
        # matching the three dimensions present in diag1/diag2.
        ct = ColumnTransformer([
            (f"{name}-0", deepcopy(vectorizer), 0),
            (f"{name}-1", deepcopy(vectorizer), 1),
            (f"{name}-2", deepcopy(vectorizer), 2)]
        )
        ct.fit_transform(X=[diag1, diag2])
Loading