# test3ax.py
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator
# Load the dataset
df = pd.read_csv("export_data.csv")
# Full feature set (currently disabled):
# features = [
#     "Duration",
#     "Peak Intensity",
#     "Peak Energy Bin",
#     "Peak Energy In Bin",
#     "Skewness",
#     "Kurtosis",
#     "Centroid",
#     "Rise Time",
#     "Decay Time",
#     "Mean Time",
#     "Std Time",
#     "Peak Time",
#     "Mean Energy",
#     "Std Energy",
#     "Total Energy Released",
# ]
features = [
    "Duration",
    # "Peak Intensity",
    # "Peak Energy Bin",
    # "Peak Energy In Bin",
    "Skewness",
    # "Kurtosis",
    "Rise Time",
    # "Decay Time",
    # "Centroid",
    # "Total Energy Released",
]
# Scale the data
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df[features])
# PCA-based feature ranking
def pca_feature_ranking(data, feature_names):
    pca = PCA(n_components=len(feature_names))  # One component per feature
    pca.fit(data)
    # Feature importance = sum of absolute contributions to all principal components
    feature_importance = np.abs(pca.components_).sum(axis=0)
    feature_ranking = sorted(
        zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True
    )
    return feature_ranking
# Get ranked features
feature_ranking = pca_feature_ranking(df_scaled, features)
print("\nFeature Ranking based on PCA contributions:")
for feature, importance in feature_ranking:
    print(f"{feature}: {importance:.4f}")
# Select the top-ranked features (at most five; fewer if fewer are enabled)
top_5_features = [feature for feature, _ in feature_ranking[:5]]
print("\nTop Features Selected:")
print(top_5_features)
# Clustering with top 5 features
df_top_features = scaler.fit_transform(df[top_5_features])
# Elbow Method to find the optimal k
sse = []
k_rng = range(1, 15)
for k in k_rng:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(df_top_features)  # use the same feature subset that is clustered below
    sse.append(km.inertia_)
kneedle = KneeLocator(k_rng, sse, curve="convex", direction="decreasing")
optimal_k = kneedle.knee
if optimal_k is None:
    print("KneeLocator found no elbow; falling back to k=4")
    optimal_k = 4
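
# Optional cross-check (a sketch; assumes at least 2 distinct clusters exist):
# the silhouette score gives a second opinion on k, independent of the elbow.
# Scores lie in [-1, 1]; higher means better-separated clusters.
from sklearn.metrics import silhouette_score
for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(df_top_features)
    print(f"k={k}: silhouette = {silhouette_score(df_top_features, labels):.4f}")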
# Plot the elbow curve
plt.figure(figsize=(10, 6))
plt.plot(k_rng, sse, marker="o")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Sum of squared errors (SSE)")
plt.title("Elbow Method for Optimal k")
plt.xticks(list(k_rng))
plt.axvline(optimal_k, color="red", linestyle="--", label="Optimal k")
plt.legend()
plt.show()
# Perform KMeans clustering
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(df_top_features)
# Add cluster labels to the DataFrame
df["Cluster"] = clusters
# Visualize clusters in 3D using the three top-ranked features directly
# (a PCA-based projection is sketched as an alternative at the end of the script)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
# Plot each cluster in a different color; KMeans was fit on df_top_features,
# so plot that same data rather than df_scaled
for cluster in range(optimal_k):
    mask = clusters == cluster
    ax.scatter(
        df_top_features[mask, 0],
        df_top_features[mask, 1],
        df_top_features[mask, 2],
        label=f"Cluster {cluster}",
    )
# Mark the centroids (they live in the same scaled top-feature space)
centroids = kmeans.cluster_centers_
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    centroids[:, 2],
    s=200,
    c="black",
    marker="*",
    label="Centroids",
)
# Labels and legend
x,y,z = features[0], features[1], features[2]
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
ax.legend()
#ax.set_title("3D Clusters Visualized with PCA")
plt.show()
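
# Alternative visualization (a sketch restoring the commented-out PCA idea):
# project the scaled top features onto their first three principal components
# and plot the same clusters in PCA space. The centroids are transformed with
# the same PCA so they remain comparable to the points.
pca_vis = PCA(n_components=3)
df_pca = pca_vis.fit_transform(df_top_features)
centroids_pca = pca_vis.transform(kmeans.cluster_centers_)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
for cluster in range(optimal_k):
    mask = clusters == cluster
    ax.scatter(df_pca[mask, 0], df_pca[mask, 1], df_pca[mask, 2], label=f"Cluster {cluster}")
ax.scatter(
    centroids_pca[:, 0],
    centroids_pca[:, 1],
    centroids_pca[:, 2],
    s=200,
    c="black",
    marker="*",
    label="Centroids",
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
ax.set_title("3D Clusters Visualized with PCA")
ax.legend()
plt.show()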