import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple, List
import numpy as np
def create_normal_distribution(mean, sd, size) -> np.ndarray:
    """Draw samples from a normal (Gaussian) distribution.

    Args:
        mean: Centre of the distribution (passed as ``loc``).
        sd: Standard deviation of the distribution (passed as ``scale``).
        size: Number of samples (or output shape) to draw.

    Returns:
        The samples produced by ``np.random.normal``, drawn from NumPy's
        global random state.
    """
    samples = np.random.normal(mean, sd, size)
    return samples
def create_distribution(mean: float, size: int) -> np.ndarray:
    """Draw ``size`` normal samples centred on ``mean``.

    The standard deviation is a quarter of the magnitude of ``mean``.

    Args:
        mean: Centre of the distribution; may be negative.
        size: Number of samples to draw.

    Returns:
        The array of samples returned by ``create_normal_distribution``.
    """
    # abs() keeps the scale non-negative: np.random.normal rejects a negative
    # scale, so a negative mean used to raise ValueError. Results for
    # non-negative means are unchanged.
    return create_normal_distribution(mean, abs(mean) * 0.25, size)
def generate_df(means: List[Tuple[float, float, str]], n: int) -> pd.DataFrame:
    """Build a labelled 2-D point cloud with ``n`` points per group.

    Args:
        means: One ``(x_mean, y_mean, label)`` triple per group.
        n: Number of points sampled for each group.

    Returns:
        A DataFrame with columns ``"x"``, ``"y"`` and ``"label"`` holding the
        groups stacked in the order they appear in ``means``.
    """
    # Every column starts from an empty float array so that an empty ``means``
    # still produces (empty) columns instead of failing in np.concatenate.
    x_parts = [np.array([])]
    y_parts = [np.array([])]
    label_parts = [np.array([])]
    for center_x, center_y, group_label in means:
        # x is sampled before y for each group, which fixes the order in which
        # the global random state is consumed.
        x_parts.append(create_distribution(center_x, n))
        y_parts.append(create_distribution(center_y, n))
        label_parts.append(np.repeat(group_label, n))
    return pd.DataFrame(
        {
            "x": np.concatenate(x_parts, axis=None),
            "y": np.concatenate(y_parts),
            "label": np.concatenate(label_parts),
        }
    )
def get_cmap(n, name="hsv"):
    """Return the colormap ``name`` resampled to ``n`` discrete colours.

    Calling the result with an integer index in ``0 .. n-1`` returns the RGBA
    colour stored at that position of the lookup table; calling it with a
    float in ``[0, 1]`` samples the table proportionally.

    Args:
        n: Number of entries in the colormap's lookup table.
        name: Name of a registered Matplotlib colormap.

    Returns:
        A Matplotlib colormap with ``n`` entries.

    Note:
        For cyclic maps such as the default ``"hsv"`` the first and last
        entries are almost the same colour, so ask for one entry more than
        the number of distinct colours needed.
    """
    # Without resampled(n) the argument n was silently ignored and the full
    # lookup table was returned, making integer indices nearly identical.
    return plt.colormaps.get_cmap(name).resampled(n)
def scatter_group_by(
    file_path: str, df: pd.DataFrame, x_column: str, y_column: str, label_column: str
):
    """Save a scatter plot of ``df`` with one colour and legend entry per label.

    Args:
        file_path: Destination of the image; handed to ``Figure.savefig``.
            The containing directory is not created here.
        df: Data to plot.
        x_column: Name of the column plotted on the x axis.
        y_column: Name of the column plotted on the y axis.
        label_column: Name of the column whose distinct values define the
            groups.
    """
    fig, ax = plt.subplots()
    try:
        labels = pd.unique(df[label_column])
        # One extra colour is requested so that a cyclic map does not give
        # the first and the last group the same hue.
        cmap = get_cmap(len(labels) + 1)
        for i, label in enumerate(labels):
            # A boolean mask instead of df.query(f"..."): it also works for
            # non-string labels, labels containing quotes and column names
            # that are not valid identifiers.
            group = df[df[label_column] == label]
            ax.scatter(
                group[x_column],
                group[y_column],
                label=label,
                # Float position in [0, 1) so the colours are spread over the
                # map whatever the size of its lookup table. This replaces the
                # former plt.set_cmap call, which does not colour scatter
                # points that carry no per-point colour data.
                color=cmap(i / len(labels)),
            )
        ax.legend()
        fig.savefig(file_path)
    finally:
        # Close this figure even when saving fails so it is not leaked.
        plt.close(fig)
def euclidean_distance(p_1: np.ndarray, p_2: np.ndarray) -> float:
    """Return the straight-line (L2) distance between ``p_1`` and ``p_2``.

    Args:
        p_1: First point.
        p_2: Second point; must be broadcast-compatible with ``p_1``.

    Returns:
        Square root of the sum of squared coordinate differences.
    """
    delta = p_2 - p_1
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)
def calculate_means(
    points: np.ndarray, labels: np.ndarray, clusters: int
) -> List[np.ndarray]:
    """Compute the centroid of every cluster.

    Args:
        points: Array with one point per row.
        labels: Cluster index of each row of ``points``.
        clusters: Number of clusters; indices ``0 .. clusters-1`` are used.

    Returns:
        A list with one mean vector per cluster index, in index order. A
        cluster without any point gets the result of averaging an empty
        selection (NaN values).
    """
    return [
        np.mean(points[labels == cluster_id], axis=0)
        for cluster_id in range(clusters)
    ]
def calculate_nearest_k(point: np.ndarray, actual_means: List[np.ndarray]):
    """Find the centroid closest to ``point``.

    Args:
        point: The point to classify.
        actual_means: Current centroids, one per cluster.

    Returns:
        A ``(point, index)`` tuple, where ``index`` is the position in
        ``actual_means`` selected by ``np.argmin`` over the Euclidean
        distances from ``point`` to each centroid.
    """
    distances = []
    for centroid in actual_means:
        distances.append(euclidean_distance(centroid, point))
    return point, np.argmin(distances)
def k_means(points: List[np.ndarray], k: int, max_iterations: int = 15):
    """Cluster ``points`` into ``k`` groups with the k-means iteration.

    Every point starts with a random cluster label. Each iteration then
    recomputes the centroids from the current labels and reassigns every
    point to its nearest centroid, until the centroids stop changing or
    ``max_iterations`` is reached.

    For 2-D data a snapshot of every iteration is written to
    ``img/kmeans_<iteration>.png`` through ``scatter_group_by``; that
    directory is not created here. Data of any other dimensionality is
    clustered without plotting.

    Args:
        points: Non-empty sequence of equally sized coordinate arrays.
        k: Number of clusters.
        max_iterations: Upper bound on the number of iterations.

    Returns:
        The list of centroids, one array per cluster (an all-zero array of
        shape ``(k, dimensions)`` when ``max_iterations`` is not positive).

    Raises:
        IndexError: If ``points`` is empty.
        ValueError: If ``k`` is smaller than 1 (raised by
            ``np.random.randint``).
    """
    n_points = len(points)
    x = np.array(points)
    # Random initial partition: every point gets a label in 0 .. k-1.
    y = np.random.randint(0, k, n_points)
    dimensions = len(points[0])
    mean = np.zeros((k, dimensions))
    for t in range(max_iterations):
        # list(...) so that individual centroids can be replaced below.
        actual_mean = list(calculate_means(points=x, labels=y, clusters=k))
        for cluster_id in range(k):
            if not np.any(y == cluster_id):
                # An empty cluster has a NaN centroid, and np.argmin picks the
                # NaN entry for every point, which would collapse the whole
                # clustering. Re-seed it from a random data point instead.
                actual_mean[cluster_id] = x[np.random.randint(n_points)].copy()
        y = np.array(
            [
                calculate_nearest_k(point=point, actual_means=actual_mean)[1]
                for point in x
            ]
        )
        if dimensions == 2:
            # The snapshot uses fixed 'x'/'y' columns, so it only applies to
            # 2-D data.
            df_points = pd.DataFrame(x, columns=['x', 'y'])
            df_points['label'] = np.char.mod('%d', y)
            df_mean = pd.DataFrame(actual_mean, columns=['x', 'y'])
            df_mean['label'] = 'centroid'
            frame = pd.concat([df_points, df_mean])
            scatter_group_by(
                file_path=f"img/kmeans_{t}.png",
                df=frame,
                x_column="x",
                y_column="y",
                label_column='label',
            )
        # Unchanged centroids mean the assignment has stabilised.
        if np.array_equal(actual_mean, mean):
            break
        mean = list(actual_mean)
    return mean
# Demo driver: sample three Gaussian groups, plot them, then run k-means.
groups = [(20, 20, "grupo0"), (300, 40, "grupo1"), (200, 200, "grupo2")]
df = generate_df(groups, 50)
print(f'df:{df}')
scatter_group_by("img/clusters.png", df, "x", "y", "label")

# Pair the (x, y) coordinates of every row with its ground-truth group label.
list_t = [
    (np.array(row[:2]), row[2])
    for row in df.itertuples(index=False, name=None)
]
points = [pair[0] for pair in list_t]
labels = [pair[1] for pair in list_t]

# Uncomment for a reproducible run:
# np.random.seed(0)
kn = k_means(points, 3)
print(kn)
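# Sample output of one run (tail of the printed DataFrame, then the centroids
# returned by k_means):
# [150 rows x 3 columns]
# [array([20.53165534, 21.09668023]), array([302.58371571, 41.61639607]), array([212.35660225, 197.49864075])]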