-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Cleansing.py
30 lines (21 loc) · 1009 Bytes
/
Data Cleansing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
'''Basic Data Cleansing Techniques
df – the dataset, already converted to a pandas DataFrame object'''
# --- Drop duplicate rows (df is rebound to the de-duplicated result) ---
df = df.drop_duplicates()
# --- Drop all records (rows) where feature value equals a particular value ---
# The boolean mask keeps the rows where <feature> != <value>, so the rows
# whose <feature> is equal to <value> are the ones removed.
df = df[df.<feature> != <value>]
# --- Drop columns from the dataset ---
# axis = 1 targets columns; inplace=True modifies df itself (nothing is returned).
df.drop([<column1>, <column2>], axis = 1, inplace=True)
# --- Display unique values of feature ---
# The result is only echoed in an interactive session (notebook/REPL);
# wrap the expression in print() when running as a script.
df.<feature>.unique()
# --- Fill missing feature values with (0, 1, string, etc.) ---
df.<feature>.fillna(<value_fill>, inplace = True)
# --- Replace one title with another ---
# replace() is called on the whole DataFrame, so matching values are replaced
# in every column; inplace=True modifies df itself.
df.replace(<title_to_replace>, <updated_title>, inplace=True)
# --- Replace several titles with a single new one ---
# <[title1, title2, ...]> is a list of values; each one found is replaced
# by <new_title>.
df.replace(<[title1, title2, ...]>, <new_title>, inplace=True)
# --- Display number of missing values by feature (categorical) ---
# Only columns of dtype 'object' are selected; the result is a per-column
# count of null entries (echoed only in an interactive session).
df.select_dtypes(include=['object']).isnull().sum()
# --- Fill missing categorical values ---
# Iterating the selected frame yields its column labels; every object-dtype
# column has its missing entries replaced by the literal string 'Missing'.
for column in df.select_dtypes(include=['object']):
    df[column] = df[column].fillna('Missing')