-
Notifications
You must be signed in to change notification settings - Fork 0
/
OutliersDrop_1Col.py
159 lines (122 loc) · 6 KB
/
OutliersDrop_1Col.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
print("\n----- OUTLIERS DROP 1 COL | Z-SCORE + IQR -----\n")
# Visual separator printed between the interactive sections of the script.
lin = "\n-----------------------------------------\n"

# ----------------------------- THRESHOLDS
# Rows whose absolute z-score exceeds this value are treated as outliers.
z_score_threshold = 2
# Multiplier applied to the inter-quartile range when building the fences.
iqr_threshold = 1.5

# ----------------------------- DATASETS
df = pd.read_csv("../Data/Guerry_HistData.csv")
# Alternative loader kept for reference (pulls "Guerry" from the R package
# "HistData" through statsmodels instead of reading the local CSV):
# df = sm.datasets.get_rdataset("Guerry", "HistData").data

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print()/an f-string prints "None".
df.info()
print()
print(f'{df.describe()}')

# Dimensions of the untouched frame, reported later next to the cleaned one.
shape = df.shape

# ----------------------------- RUN
print(lin)

# The outlier methods only make sense for numeric data, and an unknown column
# name would otherwise surface much later as a KeyError inside the menu loop.
numeric_columns = [
    name for name in df.columns if pd.api.types.is_numeric_dtype(df[name])
]
if not numeric_columns:
    raise SystemExit("The dataset has no numeric columns to check for outliers.")

# Keep asking until the answer names an existing numeric column.
while True:
    col_choose = input("Set column for Outliers check:\n")
    if col_choose in numeric_columns:
        break
    print(
        f"'{col_choose}' is not a numeric column of the dataset.\n"
        f"Available columns: {', '.join(map(str, numeric_columns))}\n"
    )
def _iqr_bounds(values, threshold):
    """Return ``(iqr, lower, upper)`` for *values*.

    Quartiles use the 'midpoint' method. NaN entries are ignored so that a
    single missing value does not turn every bound into NaN.
    ``lower`` is ``Q1 - threshold * IQR`` and ``upper`` is
    ``Q3 + threshold * IQR``.
    """
    q1 = np.nanpercentile(values, 25, method='midpoint')
    q3 = np.nanpercentile(values, 75, method='midpoint')
    iqr = q3 - q1
    return iqr, q1 - threshold * iqr, q3 + threshold * iqr


def _abs_z_scores(values):
    """Return the absolute z-score of every entry of *values* as an ndarray.

    NaN entries are left out of the mean/std computation and stay NaN in the
    result, so they never compare as greater than a threshold.
    """
    return np.abs(np.asarray(stats.zscore(values, nan_policy='omit'), dtype=float))


def _report_and_offer_export(cleaned, method_label, frame_name, export_path):
    """Print a before/after summary of *cleaned* and optionally save it as CSV.

    Reads the module-level ``lin`` separator and the original ``shape``.
    The file is written to *export_path* only when the user answers "y".
    """
    print(lin)
    print(f"----> Old Shape: {shape}")
    print(f"----> New Shape ({method_label}): {cleaned.shape}\n")
    # info() prints by itself and returns None; printing its return value
    # would emit a stray "None" line.
    cleaned.info()
    print()
    print(f'{cleaned.describe()}\n\n')
    print(f"New Dataframe -> {frame_name}")
    print(cleaned.head())
    # CSV Export
    ex_choose = input("\nExport to CSV? (Y/N)\n").strip().lower()
    if ex_choose == "y":
        cleaned.to_csv(export_path, index=False)


# Interactive menu: runs until the user enters anything that is not 1-5.
ploting = True
while ploting:
    print(lin)
    try:
        plot_choose = int(input(f"Choose outcome for -> {col_choose}\n"
                                f"(Z-THR = {z_score_threshold}, IQR-THR = {iqr_threshold}):\n\n"
                                "1. View -> Boxplot\n"
                                "2. Generate -> Z-Score + IQR\n"
                                "3. Drop outliers -> Z-Score method\n"
                                "4. Drop outliers -> IQR method\n"
                                "5. View methods 'Key Differences'\n"))
    except ValueError:
        # Non-numeric input (e.g. a bare Enter) is handled like any other
        # unlisted choice, i.e. it leaves the menu instead of crashing.
        plot_choose = None

    # ----------------------------- CHOOSE
    if plot_choose == 1:
        # Box plot with the individual observations drawn on top of it.
        sns.boxplot(data=df[col_choose], width=.2)
        sns.stripplot(data=df[col_choose], size=4, linewidth=0, color=".2")
        plt.show()

    elif plot_choose == 2:
        # Z-Score Values
        print(lin)
        print("Z-Score values:")
        z = np.round(_abs_z_scores(df[col_choose]), 2)
        print(z.tolist())
        # IQR Values and the resulting upper/lower bounds
        IQR, lower, upper = _iqr_bounds(df[col_choose], iqr_threshold)
        print(f"\nInter Quartile Range: {IQR}")
        print(f"Upper Bound: {upper}")
        print(f"Lower Bound: {lower}")

    elif plot_choose == 3:
        # Z-score method:
        # Standardizes the data by measuring how many standard deviations a
        # point lies from the mean. Rows whose absolute z-score exceeds
        # z_score_threshold (typically 2 or 3) are treated as outliers.
        # The method assumes roughly normally distributed data and, because
        # mean and standard deviation are themselves pulled by extreme values,
        # it is more sensitive to them than the IQR method.
        outliers_mask = _abs_z_scores(df[col_choose]) > z_score_threshold
        # Removing outliers using Z-Score method
        df_z = df[~outliers_mask]
        _report_and_offer_export(df_z, "Z-Score", "df_z", "../Data/df_z.csv")

    elif plot_choose == 4:
        # Interquartile Range (IQR) method:
        # Based on the quartiles of the data. The IQR is the distance between
        # the first quartile (Q1) and the third quartile (Q3); values that
        # fall below Q1 - iqr_threshold * IQR or above Q3 + iqr_threshold * IQR
        # are treated as outliers. Quartiles are barely affected by extreme
        # values, so the method does not rely on a normal distribution.
        IQR, lower, upper = _iqr_bounds(df[col_choose], iqr_threshold)
        column = df[col_choose]
        # Strict comparisons: a value lying exactly on a fence is kept. With
        # inclusive tests a column whose IQR is 0 would lose every row.
        # A boolean mask is used instead of positional indices so the result
        # does not depend on the frame having a default 0..n-1 index.
        outliers_mask = (column > upper) | (column < lower)
        # Create a new DataFrame df_q without outliers
        df_q = df[~outliers_mask].copy()
        _report_and_offer_export(df_q, "IQR", "df_q", "../Data/df_q.csv")

    elif plot_choose == 5:
        print(lin)
        print("Key Differences:\n"
              "-> The Z-score method is more sensitive to extreme values and assumes a normal distribution.\n"
              "-> Z-score is a parametric method that assumes the data follows a normal distribution.\n"
              "-> Z-score is useful when the distribution of the data is known or approximately known,\n "
              "while IQR is more robust in the presence of non-normality.\n\n"
              "-> The IQR method is based on quartiles and is less affected by extreme values.\n"
              "-> IQR is a non-parametric method, meaning it makes fewer assumptions about the underlying distribution of the data.\n\n"
              "How to choose:\n"
              "-> In practice, the choice between these methods depends on the characteristics of your data and the underlying assumptions you are willing to make.\n"
              "-> It's often a good idea to compare results from both methods and choose the one that is more appropriate for your specific dataset.")

    else:
        # Any other answer ends the session.
        ploting = False