app.py
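"""Streamlit app for vehicle clustering analysis.

Pipeline: load a vehicle sales CSV, clean and encode it, remove outliers
with an Isolation Forest, cluster the remaining rows hierarchically
(Ward linkage), and visualize the resulting clusters with a
2-component PCA.

The uploaded CSV is expected to contain the columns referenced below:
'manufact', 'model', 'type', 'sales', 'resale', 'price', 'engine_s',
'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
'mpg' and 'lnsales'.
"""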
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.cluster.hierarchy as shc
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
# Function to load and preprocess the data
def preprocess_data(file):
    data = pd.read_csv(file)
    # Drop rows that already contain missing values
    data = data.dropna()
    # Replace the "$null$" placeholder with NaN so it can be handled numerically
    data.replace("$null$", np.nan, inplace=True)
    # Columns to convert to numeric
    numeric_columns = [
        'sales', 'resale', 'price', 'engine_s', 'horsepow',
        'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
        'mpg', 'lnsales'
    ]
    # Convert the columns to numeric, coercing errors to NaN
    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')
    data["type"] = pd.to_numeric(data["type"], errors='coerce')
    # Replace missing values in the numeric columns (including 'type') with the column mean,
    # so no NaN values reach the Isolation Forest later on
    for col in numeric_columns + ["type"]:
        data[col] = data[col].fillna(data[col].mean())
    # Perform one-hot encoding on 'manufact' and 'model' columns
    data = pd.get_dummies(data, columns=['manufact', 'model'])
    return data, numeric_columns
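
# Usage sketch (hypothetical file name, outside of Streamlit):
#   cleaned, numeric_cols = preprocess_data("car_sales.csv")
#   print(cleaned.shape, numeric_cols)
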
# Function to train the Isolation Forest model and perform hierarchical clustering
def train_model(data, numerical_features):
    # Normalize numerical variables
    scaler = StandardScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])
    # Train Isolation Forest model (fixed seed for reproducible outlier detection)
    model = IsolationForest(contamination=0.05, random_state=42)
    model.fit(data)
    # Predict outliers (-1 marks an outlier, 1 an inlier)
    outliers = model.predict(data)
    # Get outlier indices
    outlier_indices = data.index[outliers == -1]
    # Remove outliers from the DataFrame
    data.drop(index=outlier_indices, inplace=True)
    # Hierarchical clustering with Ward linkage
    dend = shc.linkage(data, method='ward')
    return dend, data, scaler
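
# Usage sketch (continues the example above; names are illustrative):
#   linkage_matrix, inliers, fitted_scaler = train_model(cleaned, numeric_cols)
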
# Main function to run the Streamlit app
def main():
    st.title('Vehicle Clustering Analysis App')
    # Allow user to upload a CSV file
    uploaded_file = st.file_uploader("Upload CSV file", type=['csv'])
    if uploaded_file is not None:
        st.subheader('Uploaded Data')
        # Preprocess the data
        data, numerical_features = preprocess_data(uploaded_file)
        # Display the first few rows of the data
        st.write(data.head())
        # Train model and perform clustering
        dend, data, scaler = train_model(data, numerical_features)
        st.subheader("Data Dimensions")
        st.write(data.shape)
        # Plot histograms for numerical features
        st.subheader('Histograms')
        num_cols = 3
        num_rows = int(np.ceil(len(numerical_features) / num_cols))
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5))
        axes = axes.flatten()
        for ax, col in zip(axes, numerical_features):
            ax.hist(data[col], bins=20)
            ax.set_title(col)
        # Remove any empty subplots
        for i in range(len(numerical_features), len(axes)):
            fig.delaxes(axes[i])
        st.pyplot(fig)
        # Plot box plots for numerical features
        st.subheader('Boxplots')
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.boxplot(data=data[numerical_features], ax=ax)
        ax.tick_params(axis='x', rotation=45)
        st.pyplot(fig)
        # Plot dendrogram
        st.subheader('Dendrogram')
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.set_title('Hierarchical Clustering Dendrogram')
        dendrogram = shc.dendrogram(dend, ax=ax)
        st.pyplot(fig)
        # Estimate the number of clusters from the dendrogram colors
        # (one color is reserved for links above the color threshold, hence the -1)
        unique_colors = set(dendrogram['color_list'])
        number_of_clusters = len(unique_colors) - 1
        st.subheader("Number of Clusters")
        st.write(number_of_clusters)
        # Hierarchical clustering with AgglomerativeClustering
        agg_clustering = AgglomerativeClustering(n_clusters=number_of_clusters)
        agg_clustering.fit(data)
        # Retrieve the cluster labels
        cluster_labels = agg_clustering.labels_
        # Add the cluster labels to the DataFrame
        data['cluster'] = cluster_labels
        # Print the counts of each cluster
        st.subheader("Cluster Counts")
        st.write(data['cluster'].value_counts())
        # PCA for visualization
        st.subheader('PCA for Cluster Visualization')
        pca = PCA(n_components=2)
        principal_components = pca.fit_transform(data[numerical_features])
        pca_df = pd.DataFrame(data=principal_components, columns=['Component 1', 'Component 2'])
        pca_df['cluster'] = cluster_labels
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(x='Component 1', y='Component 2', hue='cluster', data=pca_df, palette='viridis', ax=ax)
        ax.set_title('Clusters visualized using PCA')
        st.pyplot(fig)

if __name__ == "__main__":
    main()
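
# To launch the app locally (assuming Streamlit is installed):
#   streamlit run app.py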